From e311aab85a49e4c2605dee949c22dd5931c09846 Mon Sep 17 00:00:00 2001 From: Rob Colbert Date: Tue, 11 Mar 2025 03:45:24 -0400 Subject: [PATCH] project created --- .gitignore | 1 + LICENSE | 13 ++ README.md | 26 ++++ dist/cleantext.js | 46 +++++++ dist/cleantext.js.map | 1 + dist/lib/edit-with-vi.js | 65 +++++++++ dist/lib/edit-with-vi.js.map | 1 + dist/unzalgo.js | 117 +++++++++++++++++ dist/unzalgo.js.map | 1 + index.js | 1 + package.json | 38 ++++++ pnpm-lock.yaml | 248 +++++++++++++++++++++++++++++++++++ src/cleantext.ts | 54 ++++++++ src/lib/edit-with-vi.ts | 77 +++++++++++ src/unzalgo.ts | 138 +++++++++++++++++++ tsconfig.json | 37 ++++++ types/dtp-cleantext.d.ts | 4 + 17 files changed, 868 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 dist/cleantext.js create mode 100644 dist/cleantext.js.map create mode 100644 dist/lib/edit-with-vi.js create mode 100644 dist/lib/edit-with-vi.js.map create mode 100644 dist/unzalgo.js create mode 100644 dist/unzalgo.js.map create mode 100644 index.js create mode 100644 package.json create mode 100644 pnpm-lock.yaml create mode 100644 src/cleantext.ts create mode 100644 src/lib/edit-with-vi.ts create mode 100644 src/unzalgo.ts create mode 100644 tsconfig.json create mode 100644 types/dtp-cleantext.d.ts diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b512c09 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +node_modules \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ffb446a --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright 2025 DTP Technologies, LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..993076d --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ +# DTP CleanText + +We hate "hackers" and chucklefucks _this_ much. For real for real. Cap. + +## cleanText + +Performs an unzalgo and striptags on the input text. + +## Usage + +```javascript +var DTP = require("dtp-cleantext"); +const clean = DTP.cleanText(input); +``` + +## filterText + +A more comprehensive and complete filtering of input text that includes the filtering of nonsense, guff, HTML. Then, it performs a shoetest simplification, and removes all diacritics. It finishes with a call to `cleanText` for convenience. + +## Usage + +```javascript +var DTP = require("dtp-cleantext"); + +const filtered = DTP.filterText(input, options); +``` diff --git a/dist/cleantext.js b/dist/cleantext.js new file mode 100644 index 0000000..f3cb206 --- /dev/null +++ b/dist/cleantext.js @@ -0,0 +1,46 @@ +// cleantext.ts +// Copyright (C) DTP Technologies, LLC +// License: Apache-2.0 +import { createRequire } from "module"; +const require = createRequire(import.meta.url); // jshint ignore:line +import WebTextFilter from "./lib/edit-with-vi.js"; +import { clean } from "./unzalgo.js"; +import striptags from "striptags"; +import diacritics from "diacritics"; +const shoetest = require("shoetest"); +/** + * Basic text cleaning function to remove Zalgo and tags. 
+ * @param text string The text to be cleaned + * @returns The cleaned text + */ +export function cleanText(text) { + text = clean(text); + text = striptags(text.trim()); + return text; +} +/** + * The heavy hammer of text filtering that removes all malicious and annoying + * things I know about as of this writing. Zalgo, tags, shoetest, diacritics, + * and our own custom nonsense UTF-8 and Unicode filters. + * + * This filter is very heavy-handed and merciless. + * + * @param text string The text to be filtered + * @returns The filtered text + */ +export function filterText(text) { + if (!text || typeof text !== "string" || text.length < 1) { + return text; + } + text = WebTextFilter.filterNonsense(text); + text = WebTextFilter.filterGuff(text); + text = WebTextFilter.filterHtml(text); + text = shoetest.simplify(text); + text = diacritics.remove(text); + /* + * Once all the stupidity has been stripped, strip the HTML + * tags that might remain. + */ + return clean(text); +} +//# sourceMappingURL=cleantext.js.map \ No newline at end of file diff --git a/dist/cleantext.js.map b/dist/cleantext.js.map new file mode 100644 index 0000000..a096bd4 --- /dev/null +++ b/dist/cleantext.js.map @@ -0,0 +1 @@ 
+{"version":3,"file":"cleantext.js","sourceRoot":"./src/","sources":["cleantext.ts"],"names":[],"mappings":"AAAA,eAAe;AACf,sCAAsC;AACtC,sBAAsB;AAEtB,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,qBAAqB;AAErE,OAAO,aAAa,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,KAAK,EAAE,MAAM,cAAc,CAAC;AAErC,OAAO,SAAS,MAAM,WAAW,CAAC;AAClC,OAAO,UAAU,MAAM,YAAY,CAAC;AAEpC,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAErC;;;;GAIG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY;IACpC,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC;IACnB,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;IAC9B,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,IAAI,GAAG,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IAC1C,IAAI,GAAG,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACtC,IAAI,GAAG,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IAEtC,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE/B;;;OAGG;IACH,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC;AACrB,CAAC"} \ No newline at end of file diff --git a/dist/lib/edit-with-vi.js b/dist/lib/edit-with-vi.js new file mode 100644 index 0000000..32c11a0 --- /dev/null +++ b/dist/lib/edit-with-vi.js @@ -0,0 +1,65 @@ +// edit-with-vi.ts +// Copyright (C) 2025 DTP Technologies, LLC +// All Rights Reserved +// Based on: +// https://github.com/voidful/text-filtering-js/blob/master/text_filtering.js +// - Does not extend String because stop it. +// - CommonJS module +'use strict'; +/* + * This file must only be edited with vi/vim. If you so much as *open* this file + * in VSCode, you've probably damaged the file. Do not save it. Just close it, + * and go edit the file with vi or vim. + * + * VS Code, being web-based, contains logic to filter out the content used to + * implement the filter. You will erase that content, and then various attackers + * will own your chat. 
+ * + * If attackers have owned your chat, you may want to revert or otherwise restore + * this file to it's original state. + */ +export function filterBBcode(text) { + return text.replace(/\[.*\]/g, ''); +} +export function filterLineBreak(text) { + return text.replace(/(\r\n|\n|\r)/gm, " "); +} +export function filterSmileysCode(text) { + return text + .replace(/:\$?.*:\$?/g, '') + .replace(/:\w+:?/g, '') + .replace(/:\w+/g, '') + .replace(/&#.*;/g, ''); +} +export function filterGuff(text) { + return text.replace('*** 作者被禁止或刪除 內容自動屏蔽 ***', ''); +} +export function filterHtml(text) { + return text.replace(/(<[^>]*>)/g, ' '); +} +export function filterNonsense(text) { + // edited to allow CR and LF + // text = text.replace(/[\u0000-\u001F\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g,''); + text = text.replace(/[\u0000-\u0009\u000b\u000c\u000e\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g, ''); + text = text.replace(/\u00AD/, ' '); + text = text.replace(/\u2013/, '-'); + return text; +} +export function filterAll(text) { + text = module.exports.filterSmileysCode(text); + text = module.exports.filterBBcode(text); + text = module.exports.filterGuff(text); + text = module.exports.filterHtml(text); + text = module.exports.filterLineBreak(text); + return text; +} +export default { + filterBBcode, + filterLineBreak, + filterSmileysCode, + filterGuff, + filterHtml, + filterNonsense, + filterAll, +}; +//# sourceMappingURL=edit-with-vi.js.map \ No newline at end of file diff --git a/dist/lib/edit-with-vi.js.map b/dist/lib/edit-with-vi.js.map new file mode 100644 index 0000000..8f42cb4 --- /dev/null +++ b/dist/lib/edit-with-vi.js.map @@ -0,0 +1 @@ 
+{"version":3,"file":"edit-with-vi.js","sourceRoot":"./src/","sources":["lib/edit-with-vi.ts"],"names":[],"mappings":"AAAA,kBAAkB;AAClB,2CAA2C;AAC3C,sBAAsB;AAEtB,YAAY;AACZ,6EAA6E;AAC7E,4CAA4C;AAC5C,oBAAoB;AAEpB,YAAY,CAAC;AAEb;;;;;;;;;;;GAWG;AAEH,MAAM,UAAU,YAAY,CAAE,IAAY;IACxC,OAAO,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;AACrC,CAAC;AAED,MAAM,UAAU,eAAe,CAAE,IAAY;IAC3C,OAAO,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAC,GAAG,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAE,IAAY;IAC7C,OAAO,IAAI;SACR,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;SAC1B,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;SACpB,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CACvB;AACH,CAAC;AAED,MAAM,UAAU,UAAU,CAAE,IAAY;IACtC,OAAO,IAAI,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAC;AACrD,CAAC;AAED,MAAM,UAAU,UAAU,CAAE,IAAY;IACtC,OAAO,IAAI,CAAC,OAAO,CAAC,YAAY,EAAC,GAAG,CAAC,CAAC;AACxC,CAAC;AAED,MAAM,UAAU,cAAc,CAAE,IAAY;IAC1C,4BAA4B;IAC5B,mFAAmF;IACnF,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,yEAAyE,EAAC,EAAE,CAAC,CAAC;IAElG,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAC,GAAG,CAAC,CAAC;IAClC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAC,GAAG,CAAC,CAAC;IAClC,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,SAAS,CAAE,IAAY;IACrC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC;IAC9C,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;IACzC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;IAC5C,OAAO,IAAI,CAAC;AACd,CAAC;AAED,eAAe;IACb,YAAY;IACZ,eAAe;IACf,iBAAiB;IACjB,UAAU;IACV,UAAU;IACV,cAAc;IACd,SAAS;CACV,CAAC"} \ No newline at end of file diff --git a/dist/unzalgo.js b/dist/unzalgo.js new file mode 100644 index 0000000..ec950d6 --- /dev/null +++ b/dist/unzalgo.js @@ -0,0 +1,117 @@ +// unzalgo.ts +// Copyright (C) DTP Technologies, LLC +// License: Apache-2.0 +"use strict"; +import { percentile } from "stats-lite"; +const categories = /[\p{Mn}\p{Me}]+/u; +const DEFAULT_DETECTION_THRESHOLD = 0.55; +const DEFAULT_TARGET_DENSITY = 
0; +const compose = (text) => text.normalize("NFC"); +const decompose = (text) => text.normalize("NFD"); +const computeZalgoDensity = (text) => [...text].filter((character) => categories.test(character)).length / + Math.max(text.length, 1); +const clamp = (x) => Math.max(Math.min(x, 1), 0); +/** + * Computes a score ∈ [0, 1] for every word in the input string. Each score + * represents the ratio of combining characters to total characters in a word. + * + * @param text string The input string for which to compute scores. + * @return Array An array of scores where each score describes the + * Zalgo ratio of a word. + */ +export function computeScores(text) { + const wordScores = []; + /** + * Trimming here allows us to return early. + * Without trimming, we risk dividing by `0` later when computing the score. + */ + if (!text.trim().length) { + wordScores.push(0); + } + else { + for (const word of decompose(text).split(/\s+/)) { + let banned = 0; + for (const character of word) { + if (categories.test(character)) { + ++banned; + } + } + const score = banned / word.length; + wordScores.push(score); + } + } + return wordScores; +} +/** + * Determines if the string consists of Zalgo text. Note that the occurrence + * of a combining character is not enough to trigger the detection. Instead, + * it computes a ratio for the input string and checks if it exceeds a given + * threshold. Thus, internationalized strings aren't automatically classified + * as Zalgo text. + * + * @param text string A string for which a Zalgo text check is run. + * @param detectionThreshold number A threshold ∈ [0, 1]. The higher the + * threshold, the more combining characters are needed for it to be detected + * as Zalgo text. + * @return boolean Whether the string is a Zalgo text string. 
+ */ +export function isZalgo(text, detectionThreshold = DEFAULT_DETECTION_THRESHOLD) { + const wordScores = computeScores(text); + const totalScore = percentile(wordScores, 0.75); + return totalScore >= clamp(detectionThreshold); +} +/** + * Removes all combining characters for every word in a string if the word is + * classified as Zalgo text. + * + * If `targetDensity` is specified, not all the Zalgo characters will be + * removed. Instead, they will be thinned out uniformly. + * + * @param text string + * A string for which combining characters are removed for every word whose + * Zalgo property is met. + * @param options object Options for cleaning. + * @param options.detectionThreshold number + * A threshold ∈ [0, 1]. The higher the threshold, the more combining + * characters are needed for it to be detected as Zalgo text. + * @param options.targetDensity number + * A threshold ∈ [0, 1]. The higher the density, the more Zalgo characters + * will be part of the resulting string. The result is guaranteed to have a + * Zalgo-character density that is less than or equal to the one provided. + * @return string + * A cleaned, more readable string. 
+ */ +export function clean(text, { detectionThreshold = DEFAULT_DETECTION_THRESHOLD, targetDensity = DEFAULT_TARGET_DENSITY, } = {}) { + let cleaned = ""; + const effectiveTargetDensity = clamp(targetDensity); + for (const word of decompose(text).split(/(\s+)/)) { + if (isZalgo(word, detectionThreshold)) { + let cleanedWord = ""; + const letters = [...word].map((character) => { + return { + character, + isCandidate: categories.test(character), + }; + }); + for (let i = 0; i < letters.length; ++i) { + const { character, isCandidate } = letters[i]; + if (isCandidate) { + const admissionProjection = cleanedWord + word.substring(i); + const omissionProjection = cleanedWord + word.substring(i + 1); + const admissionDistance = effectiveTargetDensity - computeZalgoDensity(admissionProjection); + const omissionDistance = effectiveTargetDensity - computeZalgoDensity(omissionProjection); + if (Math.abs(omissionDistance) <= Math.abs(admissionDistance)) { + continue; + } + } + cleanedWord += character; + } + cleaned += cleanedWord; + } + else { + cleaned += word; + } + } + return compose(cleaned); +} +//# sourceMappingURL=unzalgo.js.map \ No newline at end of file diff --git a/dist/unzalgo.js.map b/dist/unzalgo.js.map new file mode 100644 index 0000000..8033bd0 --- /dev/null +++ b/dist/unzalgo.js.map @@ -0,0 +1 @@ 
+{"version":3,"file":"unzalgo.js","sourceRoot":"./src/","sources":["unzalgo.ts"],"names":[],"mappings":"AAAA,aAAa;AACb,sCAAsC;AACtC,sBAAsB;AAEtB,YAAY,CAAC;AAEb,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAExC,MAAM,UAAU,GAAG,kBAAkB,CAAC;AACtC,MAAM,2BAA2B,GAAG,IAAI,CAAC;AACzC,MAAM,sBAAsB,GAAG,CAAC,CAAC;AACjC,MAAM,OAAO,GAAG,CAAC,IAAY,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;AACxD,MAAM,SAAS,GAAG,CAAC,IAAY,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;AAC1D,MAAM,mBAAmB,GAAG,CAAC,IAAY,EAAE,EAAE,CAC3C,CAAC,GAAG,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM;IAClE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;AAC3B,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AAEzD;;;;;;;GAOG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,UAAU,GAAkB,EAAE,CAAC;IACrC;;;OAGG;IACH,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,EAAE,CAAC;QACxB,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACrB,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;YAChD,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,KAAK,MAAM,SAAS,IAAI,IAAI,EAAE,CAAC;gBAC7B,IAAI,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;oBAC/B,EAAE,MAAM,CAAC;gBACX,CAAC;YACH,CAAC;YACD,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;YACnC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,OAAO,CACrB,IAAY,EACZ,qBAA6B,2BAA2B;IAExD,MAAM,UAAU,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACvC,MAAM,UAAU,GAAG,UAAU,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;IAChD,OAAO,UAAU,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;AACjD,CAAC;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,UAAU,KAAK,CACnB,IAAY,EACZ,EACE,kBAAkB,GAAG,2BAA2B,EAChD,aAAa,GAAG,sBAAsB,GACvC,GAAG,EAAE;IAEN,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,MAAM,sBAAsB,GAAG,KAAK,CAAC,aAAa,CAAC,CAAC;IAOpD,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;QAClD,IAAI,OAAO,CAAC,IAAI,EAAE,kBAAkB,CAAC,EAAE,CAAC;YACtC,IAAI,WAAW,GAAG,EAAE,CAAC;Y
ACrB,MAAM,OAAO,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE;gBAC1C,OAAO;oBACL,SAAS;oBACT,WAAW,EAAE,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC;iBACxC,CAAC;YACJ,CAAC,CAAC,CAAC;YACH,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;gBACxC,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,CAAC,CAAoB,CAAC;gBACjE,IAAI,WAAW,EAAE,CAAC;oBAChB,MAAM,mBAAmB,GAAG,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;oBAC5D,MAAM,kBAAkB,GAAG,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;oBAC/D,MAAM,iBAAiB,GACrB,sBAAsB,GAAG,mBAAmB,CAAC,mBAAmB,CAAC,CAAC;oBACpE,MAAM,gBAAgB,GACpB,sBAAsB,GAAG,mBAAmB,CAAC,kBAAkB,CAAC,CAAC;oBACnE,IAAI,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,iBAAiB,CAAC,EAAE,CAAC;wBAC9D,SAAS;oBACX,CAAC;gBACH,CAAC;gBACD,WAAW,IAAI,SAAS,CAAC;YAC3B,CAAC;YACD,OAAO,IAAI,WAAW,CAAC;QACzB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,IAAI,CAAC;QAClB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC,OAAO,CAAC,CAAC;AAC1B,CAAC"} \ No newline at end of file diff --git a/index.js b/index.js new file mode 100644 index 0000000..f39fa87 --- /dev/null +++ b/index.js @@ -0,0 +1 @@ +export { cleanText, filterText } from "./dist/cleantext.js"; diff --git a/package.json b/package.json new file mode 100644 index 0000000..7583800 --- /dev/null +++ b/package.json @@ -0,0 +1,38 @@ +{ + "name": "dtp-cleantext", + "version": "1.0.0", + "description": "Text filtering and safety to make garbage people very sad when they can't hack your node.", + "type": "module", + "main": "index.js", + "scripts": { + "build": "tsc", + "test": "echo \"Error: no test specified\" && exit 1" + }, + "types": "./types/dtp-cleantext.d.ts", + "keywords": [ + "xss", + "sanitize", + "filter", + "clean" + ], + "author": { + "name": "Rob Colbert", + "email": "rob@digitaltelepresence.com", + "url": "https://digitaltelepresence.com/" + }, + "license": "ISC", + "packageManager": "pnpm@10.6.1", + "dependencies": { + "diacritics": "^1.3.0", + "shoetest": "^1.2.2", + "stats-lite": "^2.2.0", + "striptags": "^3.2.0" + }, + 
"devDependencies": { + "@types/diacritics": "^1.3.3", + "@types/node": "^22.13.10", + "@types/stats-lite": "^2.2.2", + "ts-node": "^10.9.2", + "typescript": "^5.8.2" + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml new file mode 100644 index 0000000..e0d6586 --- /dev/null +++ b/pnpm-lock.yaml @@ -0,0 +1,248 @@ +lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + excludeLinksFromLockfile: false + +importers: + + .: + dependencies: + diacritics: + specifier: ^1.3.0 + version: 1.3.0 + shoetest: + specifier: ^1.2.2 + version: 1.2.2 + stats-lite: + specifier: ^2.2.0 + version: 2.2.0 + striptags: + specifier: ^3.2.0 + version: 3.2.0 + devDependencies: + '@types/diacritics': + specifier: ^1.3.3 + version: 1.3.3 + '@types/node': + specifier: ^22.13.10 + version: 22.13.10 + '@types/stats-lite': + specifier: ^2.2.2 + version: 2.2.2 + ts-node: + specifier: ^10.9.2 + version: 10.9.2(@types/node@22.13.10)(typescript@5.8.2) + typescript: + specifier: ^5.8.2 + version: 5.8.2 + +packages: + + '@cspotcode/source-map-support@0.8.1': + resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==} + engines: {node: '>=12'} + + '@jridgewell/resolve-uri@3.1.2': + resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==} + engines: {node: '>=6.0.0'} + + '@jridgewell/sourcemap-codec@1.5.0': + resolution: {integrity: sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==} + + '@jridgewell/trace-mapping@0.3.9': + resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==} + + '@tsconfig/node10@1.0.11': + resolution: {integrity: sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw==} + + '@tsconfig/node12@1.0.11': + resolution: {integrity: 
sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==} + + '@tsconfig/node14@1.0.3': + resolution: {integrity: sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==} + + '@tsconfig/node16@1.0.4': + resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==} + + '@types/diacritics@1.3.3': + resolution: {integrity: sha512-wt0tBItmBsOUVZ8+MCrkBMoVfH/EUZeTXwYSekVVYilZlGDYssREUR+sX72mHvl2IrbdCKgpYARXKh3awD2how==} + + '@types/node@22.13.10': + resolution: {integrity: sha512-I6LPUvlRH+O6VRUqYOcMudhaIdUVWfsjnZavnsraHvpBwaEyMN29ry+0UVJhImYL16xsscu0aske3yA+uPOWfw==} + + '@types/stats-lite@2.2.2': + resolution: {integrity: sha512-T+bzT53cbPbE0hMlCNZux1QuH6hQFNHIwRMTQCu3YPG0W7XUfeoULHl+TehJCjaxQx8cz4wlg5oQsOyG9LvZmA==} + + acorn-walk@8.3.4: + resolution: {integrity: sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==} + engines: {node: '>=0.4.0'} + + acorn@8.14.1: + resolution: {integrity: sha512-OvQ/2pUDKmgfCg++xsTX1wGxfTaszcHVcTctW4UJB4hibJx2HXxxO5UmVgyjMa+ZDsiaf5wWLXYpRWMmBI0QHg==} + engines: {node: '>=0.4.0'} + hasBin: true + + arg@4.1.3: + resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} + + create-require@1.1.1: + resolution: {integrity: sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==} + + diacritics@1.3.0: + resolution: {integrity: sha512-wlwEkqcsaxvPJML+rDh/2iS824jbREk6DUMUKkEaSlxdYHeS43cClJtsWglvw2RfeXGm6ohKDqsXteJ5sP5enA==} + + diff@4.0.2: + resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==} + engines: {node: '>=0.3.1'} + + drange@1.1.1: + resolution: {integrity: sha512-pYxfDYpued//QpnLIm4Avk7rsNtAtQkUES2cwAYSvD/wd2pKD71gN2Ebj3e7klzXwjocvE8c5vx/1fxwpqmSxA==} + engines: {node: '>=4'} + + 
isnumber@1.0.0: + resolution: {integrity: sha512-JLiSz/zsZcGFXPrB4I/AGBvtStkt+8QmksyZBZnVXnnK9XdTEyz0tX8CRYljtwYDuIuZzih6DpHQdi+3Q6zHPw==} + + make-error@1.3.6: + resolution: {integrity: sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==} + + randexp@0.5.3: + resolution: {integrity: sha512-U+5l2KrcMNOUPYvazA3h5ekF80FHTUG+87SEAmHZmolh1M+i/WyTCxVzmi+tidIa1tM4BSe8g2Y/D3loWDjj+w==} + engines: {node: '>=4'} + + ret@0.2.2: + resolution: {integrity: sha512-M0b3YWQs7R3Z917WRQy1HHA7Ba7D8hvZg6UE5mLykJxQVE2ju0IXbGlaHPPlkY+WN7wFP+wUMXmBFA0aV6vYGQ==} + engines: {node: '>=4'} + + shoetest@1.2.2: + resolution: {integrity: sha512-iT8kIEFcGfUwo53VUFckm+glTkc0oLycRe+YqU/W4wQuIHGIWc5KMIpDnJVdavKCyEZKQTi8IDq27rDmB09QjA==} + + stats-lite@2.2.0: + resolution: {integrity: sha512-/Kz55rgUIv2KP2MKphwYT/NCuSfAlbbMRv2ZWw7wyXayu230zdtzhxxuXXcvsc6EmmhS8bSJl3uS1wmMHFumbA==} + engines: {node: '>=2.0.0'} + + striptags@3.2.0: + resolution: {integrity: sha512-g45ZOGzHDMe2bdYMdIvdAfCQkCTDMGBazSw1ypMowwGIee7ZQ5dU0rBJ8Jqgl+jAKIv4dbeE1jscZq9wid1Tkw==} + + ts-node@10.9.2: + resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==} + hasBin: true + peerDependencies: + '@swc/core': '>=1.2.50' + '@swc/wasm': '>=1.2.50' + '@types/node': '*' + typescript: '>=2.7' + peerDependenciesMeta: + '@swc/core': + optional: true + '@swc/wasm': + optional: true + + typescript@5.8.2: + resolution: {integrity: sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==} + engines: {node: '>=14.17'} + hasBin: true + + undici-types@6.20.0: + resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==} + + v8-compile-cache-lib@3.0.1: + resolution: {integrity: sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==} + + yn@3.1.1: + resolution: {integrity: 
sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==} + engines: {node: '>=6'} + +snapshots: + + '@cspotcode/source-map-support@0.8.1': + dependencies: + '@jridgewell/trace-mapping': 0.3.9 + + '@jridgewell/resolve-uri@3.1.2': {} + + '@jridgewell/sourcemap-codec@1.5.0': {} + + '@jridgewell/trace-mapping@0.3.9': + dependencies: + '@jridgewell/resolve-uri': 3.1.2 + '@jridgewell/sourcemap-codec': 1.5.0 + + '@tsconfig/node10@1.0.11': {} + + '@tsconfig/node12@1.0.11': {} + + '@tsconfig/node14@1.0.3': {} + + '@tsconfig/node16@1.0.4': {} + + '@types/diacritics@1.3.3': {} + + '@types/node@22.13.10': + dependencies: + undici-types: 6.20.0 + + '@types/stats-lite@2.2.2': {} + + acorn-walk@8.3.4: + dependencies: + acorn: 8.14.1 + + acorn@8.14.1: {} + + arg@4.1.3: {} + + create-require@1.1.1: {} + + diacritics@1.3.0: {} + + diff@4.0.2: {} + + drange@1.1.1: {} + + isnumber@1.0.0: {} + + make-error@1.3.6: {} + + randexp@0.5.3: + dependencies: + drange: 1.1.1 + ret: 0.2.2 + + ret@0.2.2: {} + + shoetest@1.2.2: + dependencies: + randexp: 0.5.3 + + stats-lite@2.2.0: + dependencies: + isnumber: 1.0.0 + + striptags@3.2.0: {} + + ts-node@10.9.2(@types/node@22.13.10)(typescript@5.8.2): + dependencies: + '@cspotcode/source-map-support': 0.8.1 + '@tsconfig/node10': 1.0.11 + '@tsconfig/node12': 1.0.11 + '@tsconfig/node14': 1.0.3 + '@tsconfig/node16': 1.0.4 + '@types/node': 22.13.10 + acorn: 8.14.1 + acorn-walk: 8.3.4 + arg: 4.1.3 + create-require: 1.1.1 + diff: 4.0.2 + make-error: 1.3.6 + typescript: 5.8.2 + v8-compile-cache-lib: 3.0.1 + yn: 3.1.1 + + typescript@5.8.2: {} + + undici-types@6.20.0: {} + + v8-compile-cache-lib@3.0.1: {} + + yn@3.1.1: {} diff --git a/src/cleantext.ts b/src/cleantext.ts new file mode 100644 index 0000000..e83a953 --- /dev/null +++ b/src/cleantext.ts @@ -0,0 +1,54 @@ +// cleantext.ts +// Copyright (C) DTP Technologies, LLC +// License: Apache-2.0 + +import { createRequire } from "module"; +const require = 
createRequire(import.meta.url); // jshint ignore:line + +import WebTextFilter from "./lib/edit-with-vi.js"; +import { clean } from "./unzalgo.js"; + +import striptags from "striptags"; +import diacritics from "diacritics"; + +const shoetest = require("shoetest"); + +/** + * Basic text cleaning function to remove Zalgo and tags. + * @param text string The text to be cleaned + * @returns The cleaned text + */ +export function cleanText(text: string): string { + text = clean(text); + text = striptags(text.trim()); + return text; +} + +/** + * The heavy hammer of text filtering that removes all malicious and annoying + * things I know about as of this writing. Zalgo, tags, shoetest, diacritics, + * and our own custom nonsense UTF-8 and Unicode filters. + * + * This filter is very heavy-handed and merciless. + * + * @param text string The text to be filtered + * @returns The filtered text + */ +export function filterText(text: string): string { + if (!text || typeof text !== "string" || text.length < 1) { + return text; + } + + text = WebTextFilter.filterNonsense(text); + text = WebTextFilter.filterGuff(text); + text = WebTextFilter.filterHtml(text); + + text = shoetest.simplify(text); + text = diacritics.remove(text); + + /* + * Once all the stupidity has been stripped, un-Zalgo the text and strip + * any HTML tags that might remain (cleanText does both). + */ + return cleanText(text); +} diff --git a/src/lib/edit-with-vi.ts b/src/lib/edit-with-vi.ts new file mode 100644 index 0000000..a8d6175 --- /dev/null +++ b/src/lib/edit-with-vi.ts @@ -0,0 +1,77 @@ +// edit-with-vi.ts +// Copyright (C) 2025 DTP Technologies, LLC +// All Rights Reserved + +// Based on: +// https://github.com/voidful/text-filtering-js/blob/master/text_filtering.js +// - Does not extend String because stop it. +// - ES module + +'use strict'; + +/* + * This file must only be edited with vi/vim. If you so much as *open* this file + * in VSCode, you've probably damaged the file. Do not save it. 
Just close it, + * and go edit the file with vi or vim. + * + * VS Code, being web-based, contains logic to filter out the content used to + * implement the filter. You will erase that content, and then various attackers + * will own your chat. + * + * If attackers have owned your chat, you may want to revert or otherwise restore + * this file to its original state. + */ + +export function filterBBcode (text: string) : string { + return text.replace(/\[.*\]/g, ''); +} + +export function filterLineBreak (text: string) : string { + return text.replace(/(\r\n|\n|\r)/gm," "); +} + +export function filterSmileysCode (text: string) : string { + return text + .replace(/:\$?.*:\$?/g, '') + .replace(/:\w+:?/g, '') + .replace(/:\w+/g, '') + .replace(/&#.*;/g, '') + ; +} + +export function filterGuff (text: string) : string { + return text.replace('*** 作者被禁止或刪除 內容自動屏蔽 ***', ''); +} + +export function filterHtml (text: string) : string { + return text.replace(/(<[^>]*>)/g,' '); +} + +export function filterNonsense (text: string) : string { + // edited to allow CR and LF + // text = text.replace(/[\u0000-\u001F\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g,''); + text = text.replace(/[\u0000-\u0009\u000b\u000c\u000e-\u001F\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g,''); + + text = text.replace(/\u00AD/,' '); + text = text.replace(/\u2013/g,'-'); + return text; +} + +export function filterAll (text: string) : string { + text = filterSmileysCode(text); + text = filterBBcode(text); + text = filterGuff(text); + text = filterHtml(text); + text = filterLineBreak(text); + return text; +} + +export default { + filterBBcode, + filterLineBreak, + filterSmileysCode, + filterGuff, + filterHtml, + filterNonsense, + filterAll, +}; diff --git a/src/unzalgo.ts b/src/unzalgo.ts new file mode 100644 index 0000000..e8d9bdb --- /dev/null +++ b/src/unzalgo.ts @@ -0,0 +1,138 @@ +// unzalgo.ts +// Copyright (C) DTP Technologies, LLC +// 
License: Apache-2.0 + +"use strict"; + +import { percentile } from "stats-lite"; + +const categories = /[\p{Mn}\p{Me}]+/u; +const DEFAULT_DETECTION_THRESHOLD = 0.55; +const DEFAULT_TARGET_DENSITY = 0; +const compose = (text: string) => text.normalize("NFC"); +const decompose = (text: string) => text.normalize("NFD"); +const computeZalgoDensity = (text: string) => + [...text].filter((character) => categories.test(character)).length / + Math.max(text.length, 1); +const clamp = (x: number) => Math.max(Math.min(x, 1), 0); + +/** + * Computes a score ∈ [0, 1] for every word in the input string. Each score + * represents the ratio of combining characters to total characters in a word. + * + * @param text string The input string for which to compute scores. + * @return Array<number> An array of scores where each score describes the + * Zalgo ratio of a word. + */ +export function computeScores(text: string): Array<number> { + const wordScores: Array<number> = []; + /** + * Trimming here allows us to return early. + * Without trimming, we risk dividing by `0` later when computing the score. + */ + if (!text.trim().length) { + wordScores.push(0); + } else { + for (const word of decompose(text).trim().split(/\s+/)) { + let banned = 0; + for (const character of word) { + if (categories.test(character)) { + ++banned; + } + } + const score = banned / word.length; + wordScores.push(score); + } + } + return wordScores; +} + +/** + * Determines if the string consists of Zalgo text. Note that the occurrence + * of a combining character is not enough to trigger the detection. Instead, + * it takes the 75th percentile of the per-word scores and checks whether it + * reaches the threshold. Thus, internationalized strings aren't automatically + * classified as Zalgo text. + * + * @param text string A string for which a Zalgo text check is run. + * @param detectionThreshold number A threshold ∈ [0, 1]. The higher the + * threshold, the more combining characters are needed for it to be detected + * as Zalgo text. 
+ * @return boolean `true` when the 75th-percentile word score reaches the
+ * clamped threshold.
+ */
+export function isZalgo(
+  text: string,
+  detectionThreshold: number = DEFAULT_DETECTION_THRESHOLD
+): boolean {
+  // Score every word, take the upper quartile, then compare it with the
+  // threshold after forcing the threshold into [0, 1].
+  const upperQuartileScore = percentile(computeScores(text), 0.75);
+  return upperQuartileScore >= clamp(detectionThreshold);
+}
+
+/**
+ * Removes all combining characters for every word in a string if the word is
+ * classified as Zalgo text.
+ *
+ * If `targetDensity` is specified, not all the Zalgo characters will be
+ * removed. Instead, they will be thinned out uniformly.
+ *
+ * @param text string
+ * A string for which combining characters are removed for every word whose
+ * Zalgo property is met.
+ * @param options object Options for cleaning.
+ * @param options.detectionThreshold number
+ * A threshold ∈ [0, 1]. The higher the threshold, the more combining
+ * characters are needed for it to be detected as Zalgo text.
+ * @param options.targetDensity number
+ * A threshold ∈ [0, 1]. The higher the density, the more Zalgo characters
+ * will be part of the resulting string. The result is guaranteed to have a
+ * Zalgo-character density that is less than or equal to the one provided.
+ * @return string
+ * A cleaned, more readable string.
+ */
+export function clean(
+  text: string,
+  {
+    detectionThreshold = DEFAULT_DETECTION_THRESHOLD,
+    targetDensity = DEFAULT_TARGET_DENSITY,
+  } = {}
+): string {
+  let cleaned = "";
+  const effectiveTargetDensity = clamp(targetDensity);
+
+  // The capturing group keeps the whitespace runs as tokens of their own, so
+  // the original spacing is carried over into the result.
+  for (const word of decompose(text).split(/(\s+)/)) {
+    if (!isZalgo(word, detectionThreshold)) {
+      cleaned += word;
+      continue;
+    }
+    // Work on code points. Indexing the UTF-16 string with a code-point index
+    // (`word.substring(i)`) drifts after any astral character, which makes the
+    // projections below compare the wrong suffixes and leaves stray marks.
+    const characters = [...word];
+    let cleanedWord = "";
+    for (const [i, character] of characters.entries()) {
+      if (categories.test(character)) {
+        // Project the final word with and without this combining mark
+        // ("kept so far" + "rest") and keep whichever lands closer to the
+        // target density; ties drop the mark.
+        const admissionProjection = cleanedWord + characters.slice(i).join("");
+        const omissionProjection =
+          cleanedWord + characters.slice(i + 1).join("");
+        const admissionDistance =
+          effectiveTargetDensity - computeZalgoDensity(admissionProjection);
+        const omissionDistance =
+          effectiveTargetDensity - computeZalgoDensity(omissionProjection);
+        if (Math.abs(omissionDistance) <= Math.abs(admissionDistance)) {
+          continue;
+        }
+      }
+      cleanedWord += character;
+    }
+    cleaned += cleanedWord;
+  }
+  // Recompose so that surviving base + mark pairs become precomposed again.
+  return compose(cleaned);
+}
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 0000000..ee6613a
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,37 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "lib": ["es2022", "dom"],
+    "experimentalDecorators": true,
+    "emitDecoratorMetadata": true,
+    "module": "ESNext",
+    "rootDir": "./src",
+    "moduleResolution": "node",
+    "baseUrl": "./src",
+    "typeRoots": ["node_modules/@types", "types"],
+    "allowImportingTsExtensions": true,
+    "rewriteRelativeImportExtensions": true,
+    "resolveJsonModule": true,
+    "declaration": false,
+    "sourceMap": true,
+    "outDir": "dist",
+    "removeComments": false,
+    "importHelpers": true,
+    "sourceRoot": "./src",
+    "esModuleInterop": true,
+    "forceConsistentCasingInFileNames": true,
+    "strict": true,
+    
"alwaysStrict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noImplicitReturns": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedIndexedAccess": true, + "pretty": true, + "skipDefaultLibCheck": true, + "skipLibCheck": true + }, + "include": ["src/**/*.ts"], + "exclude": ["node_modules", "docs"], + "files": ["types/dtp-cleantext.d.ts"] +} diff --git a/types/dtp-cleantext.d.ts b/types/dtp-cleantext.d.ts new file mode 100644 index 0000000..c36c930 --- /dev/null +++ b/types/dtp-cleantext.d.ts @@ -0,0 +1,4 @@ +declare module "dtp-cleantext" { + export function cleanText(text: string): string; + export function filterText(text: string): string; +}