17 changed files with 868 additions and 0 deletions
@ -0,0 +1 @@ |
|||||
|
node_modules |
@ -0,0 +1,13 @@ |
|||||
|
Copyright 2025 DTP Technologies, LLC |
||||
|
|
||||
|
Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
you may not use this file except in compliance with the License. |
||||
|
You may obtain a copy of the License at |
||||
|
|
||||
|
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
|
||||
|
Unless required by applicable law or agreed to in writing, software |
||||
|
distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
See the License for the specific language governing permissions and |
||||
|
limitations under the License. |
@ -0,0 +1,26 @@ |
|||||
|
# DTP CleanText |
||||
|
|
||||
|
We hate "hackers" and chucklefucks _this_ much. For real for real. Cap. |
||||
|
|
||||
|
## cleanText |
||||
|
|
||||
|
Performs an unzalgo and striptags on the input text. |
||||
|
|
||||
|
## Usage |
||||
|
|
||||
|
```javascript |
||||
|
var DTP = require("dtp-cleantext"); |
||||
|
const clean = DTP.cleanText(input); |
||||
|
``` |
||||
|
|
||||
|
## filterText |
||||
|
|
||||
|
A more comprehensive and complete filtering of input text that includes the filtering of nonsense, guff, HTML. Then, it performs a shoetest simplification, and removes all diacritics. It finishes with a call to `cleanText` for convenience. |
||||
|
|
||||
|
## Usage |
||||
|
|
||||
|
```javascript |
||||
|
var DTP = require("dtp-cleantext"); |
||||
|
|
||||
|
const filtered = DTP.filterText(input); |
||||
|
``` |
@ -0,0 +1,46 @@ |
|||||
|
// cleantext.ts
|
||||
|
// Copyright (C) DTP Technologies, LLC
|
||||
|
// License: Apache-2.0
|
||||
|
import { createRequire } from "module"; |
||||
|
const require = createRequire(import.meta.url); // jshint ignore:line
|
||||
|
import WebTextFilter from "./lib/edit-with-vi.js"; |
||||
|
import { clean } from "./unzalgo.js"; |
||||
|
import striptags from "striptags"; |
||||
|
import diacritics from "diacritics"; |
||||
|
const shoetest = require("shoetest"); |
||||
|
/**
 * Light-weight cleaning: removes Zalgo combining marks and HTML tags.
 *
 * The input is run through the unzalgo `clean` routine first; the result is
 * trimmed of surrounding whitespace and then handed to `striptags`.
 *
 * @param text string The text to be cleaned
 * @returns The cleaned text
 */
export function cleanText(text) {
    const unzalgoed = clean(text);
    return striptags(unzalgoed.trim());
}
||||
|
/**
 * The heavy hammer of text filtering that removes all malicious and annoying
 * things I know about as of this writing. Zalgo, tags, shoetest, diacritics,
 * and our own custom nonsense UTF-8 and Unicode filters.
 *
 * This filter is very heavy-handed and merciless.
 *
 * Pipeline, in order:
 *   1. `filterNonsense`, `filterGuff`, `filterHtml` from the web text filter
 *   2. `shoetest.simplify`
 *   3. `diacritics.remove`
 *   4. `cleanText` (unzalgo, trim, striptags)
 *
 * Non-string and empty inputs are returned untouched.
 *
 * @param text string The text to be filtered
 * @returns The filtered text
 */
export function filterText(text) {
    if (typeof text !== "string" || text.length === 0) {
        return text;
    }
    text = WebTextFilter.filterNonsense(text);
    text = WebTextFilter.filterGuff(text);
    text = WebTextFilter.filterHtml(text);
    text = shoetest.simplify(text);
    text = diacritics.remove(text);
    /*
     * Once all the stupidity has been stripped, strip the Zalgo and the HTML
     * tags that might remain. This has to be `cleanText` (which runs
     * striptags), not the bare unzalgo `clean`, otherwise no tag stripping
     * happens after the simplification steps above.
     */
    return cleanText(text);
}
||||
|
//# sourceMappingURL=cleantext.js.map
|
@ -0,0 +1 @@ |
|||||
|
{"version":3,"file":"cleantext.js","sourceRoot":"./src/","sources":["cleantext.ts"],"names":[],"mappings":"AAAA,eAAe;AACf,sCAAsC;AACtC,sBAAsB;AAEtB,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,qBAAqB;AAErE,OAAO,aAAa,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,KAAK,EAAE,MAAM,cAAc,CAAC;AAErC,OAAO,SAAS,MAAM,WAAW,CAAC;AAClC,OAAO,UAAU,MAAM,YAAY,CAAC;AAEpC,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAErC;;;;GAIG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY;IACpC,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC;IACnB,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;IAC9B,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,IAAI,GAAG,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IAC1C,IAAI,GAAG,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACtC,IAAI,GAAG,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IAEtC,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE/B;;;OAGG;IACH,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC;AACrB,CAAC"} |
@ -0,0 +1,65 @@ |
|||||
|
// edit-with-vi.ts
|
||||
|
// Copyright (C) 2025 DTP Technologies, LLC
|
||||
|
// All Rights Reserved
|
||||
|
// Based on:
|
||||
|
// https://github.com/voidful/text-filtering-js/blob/master/text_filtering.js
|
||||
|
// - Does not extend String because stop it.
|
||||
|
// - ES module
|
||||
|
'use strict'; |
||||
|
/* |
||||
|
* This file must only be edited with vi/vim. If you so much as *open* this file |
||||
|
* in VSCode, you've probably damaged the file. Do not save it. Just close it, |
||||
|
* and go edit the file with vi or vim. |
||||
|
* |
||||
|
* VS Code, being web-based, contains logic to filter out the content used to |
||||
|
* implement the filter. You will erase that content, and then various attackers |
||||
|
* will own your chat. |
||||
|
* |
||||
|
* If attackers have owned your chat, you may want to revert or otherwise restore |
||||
|
* this file to its original state. |
||||
|
*/ |
||||
|
/**
 * Removes BBcode-style tags (`[b]`, `[/url]`, ...) from the text.
 *
 * The match is lazy so that each bracketed tag is removed on its own. A
 * greedy `.*` would delete everything between the first `[` and the last `]`
 * on a line, including the user's actual text between two tags.
 *
 * @param text string The text to filter
 * @returns The text with every `[...]` span removed
 */
export function filterBBcode(text) {
    return text.replace(/\[.*?\]/g, '');
}
||||
|
/**
 * Replaces every line break (CRLF, LF or lone CR) with a single space.
 *
 * @param text string The text to filter
 * @returns The text flattened onto one line
 */
export function filterLineBreak(text) {
    const lineBreak = /(\r\n|\n|\r)/gm;
    const flattened = text.replace(lineBreak, " ");
    return flattened;
}
||||
|
/**
 * Removes smiley/emoticon short-codes and numeric HTML entities.
 *
 * The patterns are applied one after another, in this order:
 *   1. anything between the first and last colon on a line (`:smile:`)
 *   2. a colon followed by word characters, with an optional closing colon
 *   3. a colon followed by word characters
 *   4. `&#...;` numeric character references
 *
 * @param text string The text to filter
 * @returns The text with the matched spans removed
 */
export function filterSmileysCode(text) {
    const patterns = [
        /:\$?.*:\$?/g,
        /:\w+:?/g,
        /:\w+/g,
        /&#.*;/g,
    ];
    let stripped = text;
    for (const pattern of patterns) {
        stripped = stripped.replace(pattern, '');
    }
    return stripped;
}
||||
|
/**
 * Removes the forum placeholder that stands in for posts by banned or deleted
 * authors (the literal reads roughly "author banned or deleted, content
 * automatically blocked").
 *
 * Every occurrence is removed. `String.prototype.replace` with a string
 * pattern only replaces the first match, so split/join is used instead.
 *
 * @param text string The text to filter
 * @returns The text without any placeholder occurrences
 */
export function filterGuff(text) {
    return text.split('*** 作者被禁止或刪除 內容自動屏蔽 ***').join('');
}
||||
|
/**
 * Replaces every `<...>` span with a single space.
 *
 * A span runs from a `<` to the next `>`; a `<` with no closing `>` is left
 * in place.
 *
 * @param text string The text to filter
 * @returns The text with tag-like spans replaced by spaces
 */
export function filterHtml(text) {
    const tagPattern = /(<[^>]*>)/g;
    const withoutTags = text.replace(tagPattern, ' ');
    return withoutTags;
}
||||
|
/**
 * Strips invisible and control characters, and normalises en dashes.
 *
 * Removed: all C0 control characters except LF (U+000A) and CR (U+000D),
 * which includes TAB; DEL (U+007F); soft hyphen (U+00AD); the zero-width
 * characters U+200B-U+200D; ideographic space (U+3000); and U+FEFF.
 * Every en dash (U+2013) becomes a plain hyphen-minus.
 *
 * @param text string The text to filter
 * @returns The filtered text
 */
export function filterNonsense(text) {
    // Upstream removed the whole U+0000-U+001F range; this variant keeps CR
    // and LF. The range above CR must stay `\u000e-\u001f`: a lone `\u000e`
    // lets U+000F-U+001F (ESC among them) through.
    text = text.replace(/[\u0000-\u0009\u000b\u000c\u000e-\u001f\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g, '');
    // Global flag so every en dash is converted, not just the first one.
    text = text.replace(/\u2013/g, '-');
    return text;
}
||||
|
/**
 * Runs the smiley, BBcode, guff, HTML and line-break filters, in that order.
 *
 * The sibling filters are called directly. This file is an ES module, where
 * no `module` binding exists, so going through `module.exports` throws a
 * ReferenceError as soon as the function is called.
 *
 * @param text string The text to filter
 * @returns The filtered, single-line text
 */
export function filterAll(text) {
    text = filterSmileysCode(text);
    text = filterBBcode(text);
    text = filterGuff(text);
    text = filterHtml(text);
    text = filterLineBreak(text);
    return text;
}
||||
|
/**
 * Default export: every filter in this module collected on one object, so
 * callers can write `WebTextFilter.filterHtml(text)`.
 */
const WebTextFilter = {
    filterBBcode: filterBBcode,
    filterLineBreak: filterLineBreak,
    filterSmileysCode: filterSmileysCode,
    filterGuff: filterGuff,
    filterHtml: filterHtml,
    filterNonsense: filterNonsense,
    filterAll: filterAll,
};
export default WebTextFilter;
||||
|
//# sourceMappingURL=edit-with-vi.js.map
|
@ -0,0 +1 @@ |
|||||
|
{"version":3,"file":"edit-with-vi.js","sourceRoot":"./src/","sources":["lib/edit-with-vi.ts"],"names":[],"mappings":"AAAA,kBAAkB;AAClB,2CAA2C;AAC3C,sBAAsB;AAEtB,YAAY;AACZ,6EAA6E;AAC7E,4CAA4C;AAC5C,oBAAoB;AAEpB,YAAY,CAAC;AAEb;;;;;;;;;;;GAWG;AAEH,MAAM,UAAU,YAAY,CAAE,IAAY;IACxC,OAAO,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;AACrC,CAAC;AAED,MAAM,UAAU,eAAe,CAAE,IAAY;IAC3C,OAAO,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAC,GAAG,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAE,IAAY;IAC7C,OAAO,IAAI;SACR,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;SAC1B,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;SACpB,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CACvB;AACH,CAAC;AAED,MAAM,UAAU,UAAU,CAAE,IAAY;IACtC,OAAO,IAAI,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAC;AACrD,CAAC;AAED,MAAM,UAAU,UAAU,CAAE,IAAY;IACtC,OAAO,IAAI,CAAC,OAAO,CAAC,YAAY,EAAC,GAAG,CAAC,CAAC;AACxC,CAAC;AAED,MAAM,UAAU,cAAc,CAAE,IAAY;IAC1C,4BAA4B;IAC5B,mFAAmF;IACnF,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,yEAAyE,EAAC,EAAE,CAAC,CAAC;IAElG,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAC,GAAG,CAAC,CAAC;IAClC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAC,GAAG,CAAC,CAAC;IAClC,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,SAAS,CAAE,IAAY;IACrC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC;IAC9C,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;IACzC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;IAC5C,OAAO,IAAI,CAAC;AACd,CAAC;AAED,eAAe;IACb,YAAY;IACZ,eAAe;IACf,iBAAiB;IACjB,UAAU;IACV,UAAU;IACV,cAAc;IACd,SAAS;CACV,CAAC"} |
@ -0,0 +1,117 @@ |
|||||
|
// unzalgo.ts
|
||||
|
// Copyright (C) DTP Technologies, LLC
|
||||
|
// License: Apache-2.0
|
||||
|
"use strict"; |
||||
|
import { percentile } from "stats-lite"; |
||||
|
// Matches characters in the Unicode categories Mn (nonspacing mark) and
// Me (enclosing mark). No `g` flag, so `.test()` carries no state.
const categories = /[\p{Mn}\p{Me}]+/u;
// Default score at or above which a word is classified as Zalgo text.
const DEFAULT_DETECTION_THRESHOLD = 0.55;
// Default share of combining marks to leave in a cleaned word (0 = none).
const DEFAULT_TARGET_DENSITY = 0;
// Canonical composition (NFC).
const compose = (text) => {
    return text.normalize("NFC");
};
// Canonical decomposition (NFD).
const decompose = (text) => {
    return text.normalize("NFD");
};
// Number of matching code points divided by the string's length
// (`text.length`, with a floor of 1 so the empty string yields 0).
const computeZalgoDensity = (text) => {
    let marks = 0;
    for (const character of text) {
        if (categories.test(character)) {
            marks += 1;
        }
    }
    return marks / Math.max(text.length, 1);
};
// Restricts x to the closed interval [0, 1].
const clamp = (x) => Math.max(0, Math.min(1, x));
||||
|
/**
 * Computes a score ∈ [0, 1] for every word in the input string. Each score
 * represents the ratio of combining characters to total characters in a word.
 *
 * Words are the whitespace-separated pieces of the NFD-decomposed input.
 * An empty or whitespace-only input yields a single score of 0.
 *
 * @param text string The input string for which to compute scores.
 * @return Array<number> An array of scores where each score describes the
 * Zalgo ratio of a word.
 */
export function computeScores(text) {
    /**
     * The text that gets split must itself be trimmed. Checking the trimmed
     * length but splitting the untrimmed text leaves an empty first or last
     * word for inputs such as " abc", and that word scores 0 / 0 = NaN.
     */
    const trimmed = decompose(text).trim();
    if (!trimmed.length) {
        return [0];
    }
    const wordScores = [];
    for (const word of trimmed.split(/\s+/)) {
        let banned = 0;
        for (const character of word) {
            if (categories.test(character)) {
                ++banned;
            }
        }
        wordScores.push(banned / word.length);
    }
    return wordScores;
}
||||
|
/**
 * Decides whether a string should be treated as Zalgo text.
 *
 * A single combining character is not enough: the per-word scores from
 * `computeScores` are reduced to their 0.75 percentile, and only when that
 * value reaches the (clamped) threshold is the string classified as Zalgo.
 * Internationalized strings are therefore not flagged automatically.
 *
 * @param text string A string for which a Zalgo text check is run.
 * @param detectionThreshold number A threshold ∈ [0, 1]. The higher the
 * threshold, the more combining characters are needed for it to be detected
 * as Zalgo text.
 * @return boolean Whether the string is a Zalgo text string.
 */
export function isZalgo(text, detectionThreshold = DEFAULT_DETECTION_THRESHOLD) {
    const threshold = clamp(detectionThreshold);
    const upperQuartile = percentile(computeScores(text), 0.75);
    return upperQuartile >= threshold;
}
||||
|
/**
 * Removes all combining characters for every word in a string if the word is
 * classified as Zalgo text.
 *
 * If `targetDensity` is specified, not all the Zalgo characters will be
 * removed. Instead, they will be thinned out uniformly.
 *
 * The input is NFD-decomposed and split on whitespace runs (the separators
 * are kept), each piece is processed on its own, and the reassembled string
 * is NFC-composed before it is returned.
 *
 * @param text string
 * A string for which combining characters are removed for every word whose
 * Zalgo property is met.
 * @param options object Options for cleaning.
 * @param options.detectionThreshold number
 * A threshold ∈ [0, 1]. The higher the threshold, the more combining
 * characters are needed for it to be detected as Zalgo text.
 * @param options.targetDensity number
 * A threshold ∈ [0, 1]. The higher the density, the more Zalgo characters
 * will be part of the resulting string. The result is guaranteed to have a
 * Zalgo-character density that is less than or equal to the one provided.
 * @return string
 * A cleaned, more readable string.
 */
export function clean(text, { detectionThreshold = DEFAULT_DETECTION_THRESHOLD, targetDensity = DEFAULT_TARGET_DENSITY, } = {}) {
    const effectiveTargetDensity = clamp(targetDensity);
    let cleaned = "";
    for (const word of decompose(text).split(/(\s+)/)) {
        if (!isZalgo(word, detectionThreshold)) {
            cleaned += word;
            continue;
        }
        // Work on code points throughout. Indexing the UTF-16 string with a
        // code-point index (`word.substring(i)`) goes out of step after any
        // astral character (e.g. an emoji), and a combining mark then
        // survives the comparison below.
        const characters = [...word];
        let cleanedWord = "";
        for (let i = 0; i < characters.length; ++i) {
            const character = characters[i];
            if (categories.test(character)) {
                const remainder = characters.slice(i + 1).join("");
                // What the word would look like with and without this mark.
                const admissionProjection = cleanedWord + character + remainder;
                const omissionProjection = cleanedWord + remainder;
                const admissionDistance = effectiveTargetDensity - computeZalgoDensity(admissionProjection);
                const omissionDistance = effectiveTargetDensity - computeZalgoDensity(omissionProjection);
                // Drop the mark when doing so lands at least as close to the
                // target density as keeping it would.
                if (Math.abs(omissionDistance) <= Math.abs(admissionDistance)) {
                    continue;
                }
            }
            cleanedWord += character;
        }
        cleaned += cleanedWord;
    }
    return compose(cleaned);
}
||||
|
//# sourceMappingURL=unzalgo.js.map
|
@ -0,0 +1 @@ |
|||||
|
{"version":3,"file":"unzalgo.js","sourceRoot":"./src/","sources":["unzalgo.ts"],"names":[],"mappings":"AAAA,aAAa;AACb,sCAAsC;AACtC,sBAAsB;AAEtB,YAAY,CAAC;AAEb,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAExC,MAAM,UAAU,GAAG,kBAAkB,CAAC;AACtC,MAAM,2BAA2B,GAAG,IAAI,CAAC;AACzC,MAAM,sBAAsB,GAAG,CAAC,CAAC;AACjC,MAAM,OAAO,GAAG,CAAC,IAAY,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;AACxD,MAAM,SAAS,GAAG,CAAC,IAAY,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;AAC1D,MAAM,mBAAmB,GAAG,CAAC,IAAY,EAAE,EAAE,CAC3C,CAAC,GAAG,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM;IAClE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;AAC3B,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AAEzD;;;;;;;GAOG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,UAAU,GAAkB,EAAE,CAAC;IACrC;;;OAGG;IACH,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,EAAE,CAAC;QACxB,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACrB,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;YAChD,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,KAAK,MAAM,SAAS,IAAI,IAAI,EAAE,CAAC;gBAC7B,IAAI,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;oBAC/B,EAAE,MAAM,CAAC;gBACX,CAAC;YACH,CAAC;YACD,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;YACnC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,OAAO,CACrB,IAAY,EACZ,qBAA6B,2BAA2B;IAExD,MAAM,UAAU,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACvC,MAAM,UAAU,GAAG,UAAU,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;IAChD,OAAO,UAAU,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;AACjD,CAAC;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,UAAU,KAAK,CACnB,IAAY,EACZ,EACE,kBAAkB,GAAG,2BAA2B,EAChD,aAAa,GAAG,sBAAsB,GACvC,GAAG,EAAE;IAEN,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,MAAM,sBAAsB,GAAG,KAAK,CAAC,aAAa,CAAC,CAAC;IAOpD,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;QAClD,IAAI,OAAO,CAAC,IAAI,EAAE,kBAAkB,CAAC,EAAE,CAAC;YACtC,IAAI,WAAW,GAAG,EAAE,CAAC;YA
CrB,MAAM,OAAO,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE;gBAC1C,OAAO;oBACL,SAAS;oBACT,WAAW,EAAE,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC;iBACxC,CAAC;YACJ,CAAC,CAAC,CAAC;YACH,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;gBACxC,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,CAAC,CAAoB,CAAC;gBACjE,IAAI,WAAW,EAAE,CAAC;oBAChB,MAAM,mBAAmB,GAAG,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;oBAC5D,MAAM,kBAAkB,GAAG,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;oBAC/D,MAAM,iBAAiB,GACrB,sBAAsB,GAAG,mBAAmB,CAAC,mBAAmB,CAAC,CAAC;oBACpE,MAAM,gBAAgB,GACpB,sBAAsB,GAAG,mBAAmB,CAAC,kBAAkB,CAAC,CAAC;oBACnE,IAAI,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,iBAAiB,CAAC,EAAE,CAAC;wBAC9D,SAAS;oBACX,CAAC;gBACH,CAAC;gBACD,WAAW,IAAI,SAAS,CAAC;YAC3B,CAAC;YACD,OAAO,IAAI,WAAW,CAAC;QACzB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,IAAI,CAAC;QAClB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC,OAAO,CAAC,CAAC;AAC1B,CAAC"} |
@ -0,0 +1 @@ |
|||||
|
export { cleanText, filterText } from "./dist/cleantext.js"; |
@ -0,0 +1,38 @@ |
|||||
|
{ |
||||
|
"name": "dtp-cleantext", |
||||
|
"version": "1.0.0", |
||||
|
"description": "Text filtering and safety to make garbage people very sad when they can't hack your node.", |
||||
|
"type": "module", |
||||
|
"main": "index.js", |
||||
|
"scripts": { |
||||
|
"build": "tsc", |
||||
|
"test": "echo \"Error: no test specified\" && exit 1" |
||||
|
}, |
||||
|
"types": "./types/dtp-cleantext.d.ts", |
||||
|
"keywords": [ |
||||
|
"xss", |
||||
|
"sanitize", |
||||
|
"filter", |
||||
|
"clean" |
||||
|
], |
||||
|
"author": { |
||||
|
"name": "Rob Colbert", |
||||
|
"email": "[email protected]", |
||||
|
"url": "https://digitaltelepresence.com/" |
||||
|
}, |
||||
|
"license": "Apache-2.0", |
||||
|
"packageManager": "[email protected]", |
||||
|
"dependencies": { |
||||
|
"diacritics": "^1.3.0", |
||||
|
"shoetest": "^1.2.2", |
||||
|
"stats-lite": "^2.2.0", |
||||
|
"striptags": "^3.2.0" |
||||
|
}, |
||||
|
"devDependencies": { |
||||
|
"@types/diacritics": "^1.3.3", |
||||
|
"@types/node": "^22.13.10", |
||||
|
"@types/stats-lite": "^2.2.2", |
||||
|
"ts-node": "^10.9.2", |
||||
|
"typescript": "^5.8.2" |
||||
|
} |
||||
|
} |
@ -0,0 +1,248 @@ |
|||||
|
lockfileVersion: '9.0' |
||||
|
|
||||
|
settings: |
||||
|
autoInstallPeers: true |
||||
|
excludeLinksFromLockfile: false |
||||
|
|
||||
|
importers: |
||||
|
|
||||
|
.: |
||||
|
dependencies: |
||||
|
diacritics: |
||||
|
specifier: ^1.3.0 |
||||
|
version: 1.3.0 |
||||
|
shoetest: |
||||
|
specifier: ^1.2.2 |
||||
|
version: 1.2.2 |
||||
|
stats-lite: |
||||
|
specifier: ^2.2.0 |
||||
|
version: 2.2.0 |
||||
|
striptags: |
||||
|
specifier: ^3.2.0 |
||||
|
version: 3.2.0 |
||||
|
devDependencies: |
||||
|
'@types/diacritics': |
||||
|
specifier: ^1.3.3 |
||||
|
version: 1.3.3 |
||||
|
'@types/node': |
||||
|
specifier: ^22.13.10 |
||||
|
version: 22.13.10 |
||||
|
'@types/stats-lite': |
||||
|
specifier: ^2.2.2 |
||||
|
version: 2.2.2 |
||||
|
ts-node: |
||||
|
specifier: ^10.9.2 |
||||
|
version: 10.9.2(@types/[email protected])([email protected]) |
||||
|
typescript: |
||||
|
specifier: ^5.8.2 |
||||
|
version: 5.8.2 |
||||
|
|
||||
|
packages: |
||||
|
|
||||
|
'@cspotcode/[email protected]': |
||||
|
resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==} |
||||
|
engines: {node: '>=12'} |
||||
|
|
||||
|
'@jridgewell/[email protected]': |
||||
|
resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==} |
||||
|
engines: {node: '>=6.0.0'} |
||||
|
|
||||
|
'@jridgewell/[email protected]': |
||||
|
resolution: {integrity: sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==} |
||||
|
|
||||
|
'@jridgewell/[email protected]': |
||||
|
resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==} |
||||
|
|
||||
|
'@tsconfig/[email protected]': |
||||
|
resolution: {integrity: sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw==} |
||||
|
|
||||
|
'@tsconfig/[email protected]': |
||||
|
resolution: {integrity: sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==} |
||||
|
|
||||
|
'@tsconfig/[email protected]': |
||||
|
resolution: {integrity: sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==} |
||||
|
|
||||
|
'@tsconfig/[email protected]': |
||||
|
resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==} |
||||
|
|
||||
|
'@types/[email protected]': |
||||
|
resolution: {integrity: sha512-wt0tBItmBsOUVZ8+MCrkBMoVfH/EUZeTXwYSekVVYilZlGDYssREUR+sX72mHvl2IrbdCKgpYARXKh3awD2how==} |
||||
|
|
||||
|
'@types/[email protected]': |
||||
|
resolution: {integrity: sha512-I6LPUvlRH+O6VRUqYOcMudhaIdUVWfsjnZavnsraHvpBwaEyMN29ry+0UVJhImYL16xsscu0aske3yA+uPOWfw==} |
||||
|
|
||||
|
'@types/[email protected]': |
||||
|
resolution: {integrity: sha512-T+bzT53cbPbE0hMlCNZux1QuH6hQFNHIwRMTQCu3YPG0W7XUfeoULHl+TehJCjaxQx8cz4wlg5oQsOyG9LvZmA==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==} |
||||
|
engines: {node: '>=0.4.0'} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-OvQ/2pUDKmgfCg++xsTX1wGxfTaszcHVcTctW4UJB4hibJx2HXxxO5UmVgyjMa+ZDsiaf5wWLXYpRWMmBI0QHg==} |
||||
|
engines: {node: '>=0.4.0'} |
||||
|
hasBin: true |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-wlwEkqcsaxvPJML+rDh/2iS824jbREk6DUMUKkEaSlxdYHeS43cClJtsWglvw2RfeXGm6ohKDqsXteJ5sP5enA==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==} |
||||
|
engines: {node: '>=0.3.1'} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-pYxfDYpued//QpnLIm4Avk7rsNtAtQkUES2cwAYSvD/wd2pKD71gN2Ebj3e7klzXwjocvE8c5vx/1fxwpqmSxA==} |
||||
|
engines: {node: '>=4'} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-JLiSz/zsZcGFXPrB4I/AGBvtStkt+8QmksyZBZnVXnnK9XdTEyz0tX8CRYljtwYDuIuZzih6DpHQdi+3Q6zHPw==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-U+5l2KrcMNOUPYvazA3h5ekF80FHTUG+87SEAmHZmolh1M+i/WyTCxVzmi+tidIa1tM4BSe8g2Y/D3loWDjj+w==} |
||||
|
engines: {node: '>=4'} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-M0b3YWQs7R3Z917WRQy1HHA7Ba7D8hvZg6UE5mLykJxQVE2ju0IXbGlaHPPlkY+WN7wFP+wUMXmBFA0aV6vYGQ==} |
||||
|
engines: {node: '>=4'} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-iT8kIEFcGfUwo53VUFckm+glTkc0oLycRe+YqU/W4wQuIHGIWc5KMIpDnJVdavKCyEZKQTi8IDq27rDmB09QjA==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-/Kz55rgUIv2KP2MKphwYT/NCuSfAlbbMRv2ZWw7wyXayu230zdtzhxxuXXcvsc6EmmhS8bSJl3uS1wmMHFumbA==} |
||||
|
engines: {node: '>=2.0.0'} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-g45ZOGzHDMe2bdYMdIvdAfCQkCTDMGBazSw1ypMowwGIee7ZQ5dU0rBJ8Jqgl+jAKIv4dbeE1jscZq9wid1Tkw==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==} |
||||
|
hasBin: true |
||||
|
peerDependencies: |
||||
|
'@swc/core': '>=1.2.50' |
||||
|
'@swc/wasm': '>=1.2.50' |
||||
|
'@types/node': '*' |
||||
|
typescript: '>=2.7' |
||||
|
peerDependenciesMeta: |
||||
|
'@swc/core': |
||||
|
optional: true |
||||
|
'@swc/wasm': |
||||
|
optional: true |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==} |
||||
|
engines: {node: '>=14.17'} |
||||
|
hasBin: true |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==} |
||||
|
|
||||
|
[email protected]: |
||||
|
resolution: {integrity: sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==} |
||||
|
engines: {node: '>=6'} |
||||
|
|
||||
|
snapshots: |
||||
|
|
||||
|
'@cspotcode/[email protected]': |
||||
|
dependencies: |
||||
|
'@jridgewell/trace-mapping': 0.3.9 |
||||
|
|
||||
|
'@jridgewell/[email protected]': {} |
||||
|
|
||||
|
'@jridgewell/[email protected]': {} |
||||
|
|
||||
|
'@jridgewell/[email protected]': |
||||
|
dependencies: |
||||
|
'@jridgewell/resolve-uri': 3.1.2 |
||||
|
'@jridgewell/sourcemap-codec': 1.5.0 |
||||
|
|
||||
|
'@tsconfig/[email protected]': {} |
||||
|
|
||||
|
'@tsconfig/[email protected]': {} |
||||
|
|
||||
|
'@tsconfig/[email protected]': {} |
||||
|
|
||||
|
'@tsconfig/[email protected]': {} |
||||
|
|
||||
|
'@types/[email protected]': {} |
||||
|
|
||||
|
'@types/[email protected]': |
||||
|
dependencies: |
||||
|
undici-types: 6.20.0 |
||||
|
|
||||
|
'@types/[email protected]': {} |
||||
|
|
||||
|
[email protected]: |
||||
|
dependencies: |
||||
|
acorn: 8.14.1 |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: |
||||
|
dependencies: |
||||
|
drange: 1.1.1 |
||||
|
ret: 0.2.2 |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: |
||||
|
dependencies: |
||||
|
randexp: 0.5.3 |
||||
|
|
||||
|
[email protected]: |
||||
|
dependencies: |
||||
|
isnumber: 1.0.0 |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected](@types/[email protected])([email protected]): |
||||
|
dependencies: |
||||
|
'@cspotcode/source-map-support': 0.8.1 |
||||
|
'@tsconfig/node10': 1.0.11 |
||||
|
'@tsconfig/node12': 1.0.11 |
||||
|
'@tsconfig/node14': 1.0.3 |
||||
|
'@tsconfig/node16': 1.0.4 |
||||
|
'@types/node': 22.13.10 |
||||
|
acorn: 8.14.1 |
||||
|
acorn-walk: 8.3.4 |
||||
|
arg: 4.1.3 |
||||
|
create-require: 1.1.1 |
||||
|
diff: 4.0.2 |
||||
|
make-error: 1.3.6 |
||||
|
typescript: 5.8.2 |
||||
|
v8-compile-cache-lib: 3.0.1 |
||||
|
yn: 3.1.1 |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
||||
|
|
||||
|
[email protected]: {} |
@ -0,0 +1,54 @@ |
|||||
|
// cleantext.ts
|
||||
|
// Copyright (C) DTP Technologies, LLC
|
||||
|
// License: Apache-2.0
|
||||
|
|
||||
|
import { createRequire } from "module"; |
||||
|
const require = createRequire(import.meta.url); // jshint ignore:line
|
||||
|
|
||||
|
import WebTextFilter from "./lib/edit-with-vi.js"; |
||||
|
import { clean } from "./unzalgo.js"; |
||||
|
|
||||
|
import striptags from "striptags"; |
||||
|
import diacritics from "diacritics"; |
||||
|
|
||||
|
const shoetest = require("shoetest"); |
||||
|
|
||||
|
/**
 * Light-weight cleaning: removes Zalgo combining marks and HTML tags.
 *
 * The input is run through the unzalgo `clean` routine first; the result is
 * trimmed of surrounding whitespace and then handed to `striptags`.
 *
 * @param text The text to be cleaned
 * @returns The cleaned text
 */
export function cleanText(text: string): string {
  const unzalgoed = clean(text);
  return striptags(unzalgoed.trim());
}
||||
|
|
||||
|
/**
 * The heavy hammer of text filtering that removes all malicious and annoying
 * things I know about as of this writing. Zalgo, tags, shoetest, diacritics,
 * and our own custom nonsense UTF-8 and Unicode filters.
 *
 * This filter is very heavy-handed and merciless.
 *
 * Pipeline, in order:
 *   1. `filterNonsense`, `filterGuff`, `filterHtml` from the web text filter
 *   2. `shoetest.simplify`
 *   3. `diacritics.remove`
 *   4. `cleanText` (unzalgo, trim, striptags)
 *
 * Non-string and empty inputs are returned untouched.
 *
 * @param text The text to be filtered
 * @returns The filtered text
 */
export function filterText(text: string): string {
  if (typeof text !== "string" || text.length === 0) {
    return text;
  }

  text = WebTextFilter.filterNonsense(text);
  text = WebTextFilter.filterGuff(text);
  text = WebTextFilter.filterHtml(text);

  text = shoetest.simplify(text);
  text = diacritics.remove(text);

  /*
   * Once all the stupidity has been stripped, strip the Zalgo and the HTML
   * tags that might remain. This has to be `cleanText` (which runs
   * striptags), not the bare unzalgo `clean`, otherwise no tag stripping
   * happens after the simplification steps above.
   */
  return cleanText(text);
}
@ -0,0 +1,77 @@ |
|||||
|
// edit-with-vi.ts
|
||||
|
// Copyright (C) 2025 DTP Technologies, LLC
|
||||
|
// All Rights Reserved
|
||||
|
|
||||
|
// Based on:
|
||||
|
// https://github.com/voidful/text-filtering-js/blob/master/text_filtering.js
|
||||
|
// - Does not extend String because stop it.
|
||||
|
// - ES module
|
||||
|
|
||||
|
'use strict'; |
||||
|
|
||||
|
/* |
||||
|
* This file must only be edited with vi/vim. If you so much as *open* this file |
||||
|
* in VSCode, you've probably damaged the file. Do not save it. Just close it, |
||||
|
* and go edit the file with vi or vim. |
||||
|
* |
||||
|
* VS Code, being web-based, contains logic to filter out the content used to |
||||
|
* implement the filter. You will erase that content, and then various attackers |
||||
|
* will own your chat. |
||||
|
* |
||||
|
* If attackers have owned your chat, you may want to revert or otherwise restore |
||||
|
* this file to its original state. |
||||
|
*/ |
||||
|
|
||||
|
/**
 * Removes BBcode-style tags (`[b]`, `[/url]`, ...) from the text.
 *
 * The match is lazy so that each bracketed tag is removed on its own. A
 * greedy `.*` would delete everything between the first `[` and the last `]`
 * on a line, including the user's actual text between two tags.
 *
 * @param text The text to filter
 * @returns The text with every `[...]` span removed
 */
export function filterBBcode(text: string): string {
  return text.replace(/\[.*?\]/g, '');
}
||||
|
|
||||
|
/**
 * Replaces every line break (CRLF, LF or lone CR) with a single space.
 *
 * @param text The text to filter
 * @returns The text flattened onto one line
 */
export function filterLineBreak(text: string): string {
  const lineBreak = /(\r\n|\n|\r)/gm;
  const flattened = text.replace(lineBreak, " ");
  return flattened;
}
||||
|
|
||||
|
/**
 * Removes smiley/emoticon short-codes and numeric HTML entities.
 *
 * The patterns are applied one after another, in this order:
 *   1. anything between the first and last colon on a line (`:smile:`)
 *   2. a colon followed by word characters, with an optional closing colon
 *   3. a colon followed by word characters
 *   4. `&#...;` numeric character references
 *
 * @param text The text to filter
 * @returns The text with the matched spans removed
 */
export function filterSmileysCode(text: string): string {
  const patterns = [
    /:\$?.*:\$?/g,
    /:\w+:?/g,
    /:\w+/g,
    /&#.*;/g,
  ];
  let stripped = text;
  for (const pattern of patterns) {
    stripped = stripped.replace(pattern, '');
  }
  return stripped;
}
||||
|
|
||||
|
/**
 * Removes the forum placeholder that stands in for posts by banned or deleted
 * authors (the literal reads roughly "author banned or deleted, content
 * automatically blocked").
 *
 * Every occurrence is removed. `String.prototype.replace` with a string
 * pattern only replaces the first match, so split/join is used instead.
 *
 * @param text The text to filter
 * @returns The text without any placeholder occurrences
 */
export function filterGuff(text: string): string {
  return text.split('*** 作者被禁止或刪除 內容自動屏蔽 ***').join('');
}
||||
|
|
||||
|
/**
 * Replaces every `<...>` span with a single space.
 *
 * A span runs from a `<` to the next `>`; a `<` with no closing `>` is left
 * in place.
 *
 * @param text The text to filter
 * @returns The text with tag-like spans replaced by spaces
 */
export function filterHtml(text: string): string {
  const tagPattern = /(<[^>]*>)/g;
  const withoutTags = text.replace(tagPattern, ' ');
  return withoutTags;
}
||||
|
|
||||
|
/**
 * Strips invisible and control characters, and normalises en dashes.
 *
 * Removed: all C0 control characters except LF (U+000A) and CR (U+000D),
 * which includes TAB; DEL (U+007F); soft hyphen (U+00AD); the zero-width
 * characters U+200B-U+200D; ideographic space (U+3000); and U+FEFF.
 * Every en dash (U+2013) becomes a plain hyphen-minus.
 *
 * @param text The text to filter
 * @returns The filtered text
 */
export function filterNonsense(text: string): string {
  // Upstream removed the whole U+0000-U+001F range; this variant keeps CR
  // and LF. The range above CR must stay `\u000e-\u001f`: a lone `\u000e`
  // lets U+000F-U+001F (ESC among them) through.
  text = text.replace(/[\u0000-\u0009\u000b\u000c\u000e-\u001f\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g, '');

  // Global flag so every en dash is converted, not just the first one.
  text = text.replace(/\u2013/g, '-');
  return text;
}
||||
|
|
||||
|
/**
 * Runs the smiley, BBcode, guff, HTML and line-break filters, in that order.
 *
 * The sibling filters are called directly. This file is an ES module, where
 * no `module` binding exists, so going through `module.exports` throws a
 * ReferenceError as soon as the function is called.
 *
 * @param text The text to filter
 * @returns The filtered, single-line text
 */
export function filterAll(text: string): string {
  text = filterSmileysCode(text);
  text = filterBBcode(text);
  text = filterGuff(text);
  text = filterHtml(text);
  text = filterLineBreak(text);
  return text;
}
||||
|
|
||||
|
// Default export: the individual filters and the combined pipeline bundled
// into one object, for callers that import the module as a single value.
export default {
  filterBBcode: filterBBcode,
  filterLineBreak: filterLineBreak,
  filterSmileysCode: filterSmileysCode,
  filterGuff: filterGuff,
  filterHtml: filterHtml,
  filterNonsense: filterNonsense,
  filterAll: filterAll,
};
@ -0,0 +1,138 @@ |
|||||
|
// unzalgo.ts
|
||||
|
// Copyright (C) DTP Technologies, LLC
|
||||
|
// License: Apache-2.0
|
||||
|
|
||||
|
"use strict"; |
||||
|
|
||||
|
import { percentile } from "stats-lite"; |
||||
|
|
||||
|
/** Matches combining marks (Unicode general categories Mn and Me). */
const categories = /[\p{Mn}\p{Me}]+/u;
/** Default score at or above which text is classified as Zalgo. */
const DEFAULT_DETECTION_THRESHOLD = 0.55;
/** Default density of combining marks left behind after cleaning. */
const DEFAULT_TARGET_DENSITY = 0;
/** Recombines base characters and marks (NFC). */
const compose = (text: string) => text.normalize("NFC");
/** Splits precomposed characters into base character plus marks (NFD). */
const decompose = (text: string) => text.normalize("NFD");
/** Ratio of combining marks in `text` to its length (at least 1). */
const computeZalgoDensity = (text: string) =>
  [...text].filter((character) => categories.test(character)).length /
  Math.max(text.length, 1);
/** Restricts `x` to the interval [0, 1]. */
const clamp = (x: number) => Math.max(Math.min(x, 1), 0);

/**
 * Computes a score ∈ [0, 1] for every word in the input string. Each score
 * represents the ratio of combining characters to total characters in a word.
 *
 * @param text The input string for which to compute scores.
 * @returns An array of scores where each score describes the Zalgo ratio of
 * a word. A blank or empty input yields `[0]`.
 */
export function computeScores(text: string): Array<number> {
  /**
   * The decomposed text is trimmed before it is split. Leading or trailing
   * whitespace would otherwise produce empty "words", and scoring an empty
   * word divides `0` by `0`, which puts `NaN` into the result.
   */
  const trimmed = decompose(text).trim();
  if (!trimmed.length) {
    return [0];
  }
  const wordScores: Array<number> = [];
  for (const word of trimmed.split(/\s+/)) {
    let banned = 0;
    for (const character of word) {
      if (categories.test(character)) {
        ++banned;
      }
    }
    wordScores.push(banned / word.length);
  }
  return wordScores;
}
||||
|
|
||||
|
/**
 * Decides whether a string should be treated as Zalgo text.
 *
 * A single combining character is not enough to trigger detection. The
 * per-word scores are computed, their 75th percentile is taken, and that
 * value is compared against the threshold, so ordinary internationalized
 * text is not automatically classified as Zalgo.
 *
 * @param text The string to check.
 * @param detectionThreshold A threshold ∈ [0, 1] (values outside that range
 * are clamped). The higher the threshold, the more combining characters are
 * needed for the text to be detected as Zalgo.
 * @returns Whether the string is classified as Zalgo text.
 */
export function isZalgo(
  text: string,
  detectionThreshold: number = DEFAULT_DETECTION_THRESHOLD
): boolean {
  const threshold = clamp(detectionThreshold);
  const upperQuartileScore = percentile(computeScores(text), 0.75);
  return upperQuartileScore >= threshold;
}
||||
|
|
||||
|
/**
 * Removes combining characters from every word of a string that is
 * classified as Zalgo text. Words that are not classified as Zalgo, and the
 * whitespace between words, are passed through unchanged.
 *
 * If `targetDensity` is specified, not all combining characters are removed.
 * Each one is kept or dropped depending on which choice brings the projected
 * density of the word closer to the target.
 *
 * @param text The string to clean.
 * @param options Options for cleaning.
 * @param options.detectionThreshold A threshold ∈ [0, 1]. The higher the
 * threshold, the more combining characters are needed for a word to be
 * detected as Zalgo text.
 * @param options.targetDensity A density ∈ [0, 1] (clamped). The higher the
 * density, the more combining characters remain in the resulting string.
 * @returns A cleaned, more readable string, recomposed to NFC.
 */
export function clean(
  text: string,
  {
    detectionThreshold = DEFAULT_DETECTION_THRESHOLD,
    targetDensity = DEFAULT_TARGET_DENSITY,
  } = {}
): string {
  const effectiveTargetDensity = clamp(targetDensity);
  let cleaned = "";

  // The capturing group keeps the whitespace separators in the split result,
  // so the original spacing is reproduced in the output.
  for (const word of decompose(text).split(/(\s+)/)) {
    if (!isZalgo(word, detectionThreshold)) {
      cleaned += word;
      continue;
    }

    // Work on code points throughout. Indexing the UTF-16 string with a
    // code-point index (as `substring` would) misaligns the projections as
    // soon as the word contains an astral character such as an emoji.
    const characters = [...word];
    let cleanedWord = "";
    for (const [index, character] of characters.entries()) {
      if (categories.test(character)) {
        const remainder = characters.slice(index + 1).join("");
        // Projected word if this mark is kept vs. if it is dropped.
        const admissionProjection = cleanedWord + character + remainder;
        const omissionProjection = cleanedWord + remainder;
        const admissionDistance =
          effectiveTargetDensity - computeZalgoDensity(admissionProjection);
        const omissionDistance =
          effectiveTargetDensity - computeZalgoDensity(omissionProjection);
        // Drop the mark when doing so lands at least as close to the target.
        if (Math.abs(omissionDistance) <= Math.abs(admissionDistance)) {
          continue;
        }
      }
      cleanedWord += character;
    }
    cleaned += cleanedWord;
  }
  return compose(cleaned);
}
@ -0,0 +1,37 @@ |
|||||
|
{ |
||||
|
"compilerOptions": { |
||||
|
"target": "ES2022", |
||||
|
"lib": ["es2022", "dom"], |
||||
|
"experimentalDecorators": true, |
||||
|
"emitDecoratorMetadata": true, |
||||
|
"module": "ESNext", |
||||
|
"rootDir": "./src", |
||||
|
"moduleResolution": "node", |
||||
|
"baseUrl": "./src", |
||||
|
"typeRoots": ["node_modules/@types", "types"], |
||||
|
"allowImportingTsExtensions": true, |
||||
|
"rewriteRelativeImportExtensions": true, |
||||
|
"resolveJsonModule": true, |
||||
|
"declaration": false, |
||||
|
"sourceMap": true, |
||||
|
"outDir": "dist", |
||||
|
"removeComments": false, |
||||
|
"importHelpers": true, |
||||
|
"sourceRoot": "./src", |
||||
|
"esModuleInterop": true, |
||||
|
"forceConsistentCasingInFileNames": true, |
||||
|
"strict": true, |
||||
|
"alwaysStrict": true, |
||||
|
"noUnusedLocals": true, |
||||
|
"noUnusedParameters": true, |
||||
|
"noImplicitReturns": true, |
||||
|
"noFallthroughCasesInSwitch": true, |
||||
|
"noUncheckedIndexedAccess": true, |
||||
|
"pretty": true, |
||||
|
"skipDefaultLibCheck": true, |
||||
|
"skipLibCheck": true |
||||
|
}, |
||||
|
"include": ["src/**/*.ts"], |
||||
|
"exclude": ["node_modules", "docs"], |
||||
|
"files": ["types/dtp-cleantext.d.ts"] |
||||
|
} |
@ -0,0 +1,4 @@ |
|||||
|
declare module "dtp-cleantext" {
  /**
   * Performs an unzalgo and striptags pass on the input text.
   */
  export function cleanText(text: string): string;

  /**
   * Applies the more comprehensive filtering described in the README and
   * finishes with a `cleanText` pass.
   *
   * NOTE(review): the README calls this as `filterText(input, options)`. The
   * shape of `options` is not visible from this declaration file, so it is
   * typed loosely here — confirm against the implementation and tighten.
   */
  export function filterText(text: string, options?: object): string;
}
Loading…
Reference in new issue