17 changed files with 868 additions and 0 deletions
@ -0,0 +1 @@ |
|||
node_modules |
@ -0,0 +1,13 @@ |
|||
Copyright 2025 DTP Technologies, LLC |
|||
|
|||
Licensed under the Apache License, Version 2.0 (the "License"); |
|||
you may not use this file except in compliance with the License. |
|||
You may obtain a copy of the License at |
|||
|
|||
http://www.apache.org/licenses/LICENSE-2.0 |
|||
|
|||
Unless required by applicable law or agreed to in writing, software |
|||
distributed under the License is distributed on an "AS IS" BASIS, |
|||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
See the License for the specific language governing permissions and |
|||
limitations under the License. |
@ -0,0 +1,26 @@ |
|||
# DTP CleanText |
|||
|
|||
We hate "hackers" and chucklefucks _this_ much. For real for real. Cap. |
|||
|
|||
## cleanText |
|||
|
|||
Performs an unzalgo and striptags on the input text. |
|||
|
|||
## Usage |
|||
|
|||
```javascript |
|||
var DTP = require("dtp-cleantext"); |
|||
const clean = DTP.cleanText(input); |
|||
``` |
|||
|
|||
## filterText |
|||
|
|||
A more comprehensive and complete filtering of input text that includes the filtering of nonsense, guff, HTML. Then, it performs a shoetest simplification, and removes all diacritics. It finishes with a call to `cleanText` for convenience. |
|||
|
|||
## Usage |
|||
|
|||
```javascript |
|||
var DTP = require("dtp-cleantext"); |
|||
|
|||
const filtered = DTP.filterText(input); |
|||
``` |
@ -0,0 +1,46 @@ |
|||
// cleantext.ts
|
|||
// Copyright (C) DTP Technologies, LLC
|
|||
// License: Apache-2.0
|
|||
import { createRequire } from "module"; |
|||
const require = createRequire(import.meta.url); // jshint ignore:line
|
|||
import WebTextFilter from "./lib/edit-with-vi.js"; |
|||
import { clean } from "./unzalgo.js"; |
|||
import striptags from "striptags"; |
|||
import diacritics from "diacritics"; |
|||
const shoetest = require("shoetest"); |
|||
/**
 * Lightweight cleaner: runs the unzalgo pass over the text, trims the
 * result, and hands it to striptags.
 *
 * @param text string The text to be cleaned
 * @returns The cleaned text
 */
export function cleanText(text) {
  const unzalgoed = clean(text);
  return striptags(unzalgoed.trim());
}
|||
/**
 * The heavy hammer of text filtering. Applies, in order: the custom
 * nonsense/guff/HTML filters, the shoetest simplification, diacritics
 * removal, and finally `cleanText` (unzalgo + trim + striptags).
 *
 * This filter is very heavy-handed and merciless.
 *
 * @param text string The text to be filtered
 * @returns The filtered text. Non-string and empty input is returned
 * unchanged.
 */
export function filterText(text) {
  // Nothing to do for non-strings (null, undefined, numbers, ...) or "".
  if (typeof text !== "string" || text.length === 0) {
    return text;
  }

  text = WebTextFilter.filterNonsense(text);
  text = WebTextFilter.filterGuff(text);
  text = WebTextFilter.filterHtml(text);

  text = shoetest.simplify(text);
  text = diacritics.remove(text);

  /*
   * Once all the stupidity has been stripped, strip the HTML tags that
   * might remain. This has to be cleanText() rather than clean(): clean()
   * only removes Zalgo marks and never touches tags.
   */
  return cleanText(text);
}
|||
//# sourceMappingURL=cleantext.js.map
|
@ -0,0 +1 @@ |
|||
{"version":3,"file":"cleantext.js","sourceRoot":"./src/","sources":["cleantext.ts"],"names":[],"mappings":"AAAA,eAAe;AACf,sCAAsC;AACtC,sBAAsB;AAEtB,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,qBAAqB;AAErE,OAAO,aAAa,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,KAAK,EAAE,MAAM,cAAc,CAAC;AAErC,OAAO,SAAS,MAAM,WAAW,CAAC;AAClC,OAAO,UAAU,MAAM,YAAY,CAAC;AAEpC,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAErC;;;;GAIG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY;IACpC,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC;IACnB,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;IAC9B,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,IAAI,GAAG,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IAC1C,IAAI,GAAG,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACtC,IAAI,GAAG,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IAEtC,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE/B;;;OAGG;IACH,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC;AACrB,CAAC"} |
@ -0,0 +1,65 @@ |
|||
// edit-with-vi.ts
|
|||
// Copyright (C) 2025 DTP Technologies, LLC
|
|||
// All Rights Reserved
|
|||
// Based on:
|
|||
// https://github.com/voidful/text-filtering-js/blob/master/text_filtering.js
|
|||
// - Does not extend String because stop it.
|
|||
// - ES module (uses `export`, not `module.exports`)
|
|||
'use strict'; |
|||
/* |
|||
* This file must only be edited with vi/vim. If you so much as *open* this file |
|||
* in VSCode, you've probably damaged the file. Do not save it. Just close it, |
|||
* and go edit the file with vi or vim. |
|||
* |
|||
* VS Code, being web-based, contains logic to filter out the content used to |
|||
* implement the filter. You will erase that content, and then various attackers |
|||
* will own your chat. |
|||
* |
|||
* If attackers have owned your chat, you may want to revert or otherwise restore |
|||
 * this file to its original state. |
|||
*/ |
|||
/**
 * Removes BBcode-style tags such as `[b]` and `[/b]`.
 *
 * Each bracketed tag is matched on its own (`[^\]]*`), so the text between
 * an opening and a closing tag is kept. A greedy `.*` would delete
 * everything from the first `[` to the last `]` on the line.
 *
 * @param text string The text to filter
 * @returns The text with every `[...]` span removed
 */
export function filterBBcode(text) {
  return text.replace(/\[[^\]]*\]/g, '');
}
|||
/**
 * Replaces every line break (CRLF, LF or CR) with a single space.
 *
 * @param text string The text to filter
 * @returns The text with line breaks replaced by spaces
 */
export function filterLineBreak(text) {
  const lineBreak = /(\r\n|\n|\r)/gm;
  const flattened = text.replace(lineBreak, " ");
  return flattened;
}
|||
/**
 * Strips smiley codes and numeric character references by applying four
 * patterns one after another; each match is deleted.
 *
 * @param text string The text to filter
 * @returns The text with all matches removed
 */
export function filterSmileysCode(text) {
  // Order matters: each pattern runs on the output of the previous one.
  const patterns = [
    /:\$?.*:\$?/g,
    /:\w+:?/g,
    /:\w+/g,
    /&#.*;/g,
  ];
  let result = text;
  for (const pattern of patterns) {
    result = result.replace(pattern, '');
  }
  return result;
}
|||
/**
 * Removes every occurrence of the forum placeholder notice from the text.
 *
 * `String.prototype.replace` with a string pattern only replaces the first
 * occurrence, so split/join is used to drop all of them.
 *
 * @param text string The text to filter
 * @returns The text without the placeholder notice
 */
export function filterGuff(text) {
  const notice = '*** 作者被禁止或刪除 內容自動屏蔽 ***';
  return text.split(notice).join('');
}
|||
/**
 * Replaces every `<...>` span with a single space.
 *
 * @param text string The text to filter
 * @returns The text with each angle-bracket span replaced by a space
 */
export function filterHtml(text) {
  const tagPattern = /(<[^>]*>)/g;
  const withoutTags = text.replace(tagPattern, ' ');
  return withoutTags;
}
|||
/**
 * Strips invisible and control characters, and normalises en dashes.
 *
 * Removed: C0 control characters except LF (U+000A) and CR (U+000D), DEL
 * (U+007F), soft hyphen (U+00AD), U+200B-U+200D, U+3000 and U+FEFF.
 * Every en dash (U+2013) becomes an ASCII hyphen.
 *
 * @param text string The text to filter
 * @returns The filtered text
 */
export function filterNonsense(text) {
  // Derived from the original class [\u0000-\u001F...] but edited to allow
  // CR and LF. The upper part of the C0 range must stay a range
  // (\u000e-\u001f); a lone \u000e would let U+000F-U+001F through.
  text = text.replace(/[\u0000-\u0009\u000b\u000c\u000e-\u001f\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g, '');

  // Soft hyphens are already deleted by the class above, so no separate
  // U+00AD replacement is needed. En dashes need the global flag so that
  // all of them are replaced, not just the first.
  text = text.replace(/\u2013/g, '-');
  return text;
}
|||
/**
 * Runs the smiley, BBcode, guff, HTML and line-break filters in sequence.
 *
 * The filters are called directly: this file is an ES module, where
 * `module.exports` does not exist and referencing it throws.
 *
 * @param text string The text to filter
 * @returns The filtered text
 */
export function filterAll(text) {
  text = filterSmileysCode(text);
  text = filterBBcode(text);
  text = filterGuff(text);
  text = filterHtml(text);
  text = filterLineBreak(text);
  return text;
}
|||
// Default export: all filters bundled on one object, for callers that
// prefer `WebTextFilter.filterX(...)` over named imports.
export default {
  filterBBcode: filterBBcode,
  filterLineBreak: filterLineBreak,
  filterSmileysCode: filterSmileysCode,
  filterGuff: filterGuff,
  filterHtml: filterHtml,
  filterNonsense: filterNonsense,
  filterAll: filterAll,
};
|||
//# sourceMappingURL=edit-with-vi.js.map
|
@ -0,0 +1 @@ |
|||
{"version":3,"file":"edit-with-vi.js","sourceRoot":"./src/","sources":["lib/edit-with-vi.ts"],"names":[],"mappings":"AAAA,kBAAkB;AAClB,2CAA2C;AAC3C,sBAAsB;AAEtB,YAAY;AACZ,6EAA6E;AAC7E,4CAA4C;AAC5C,oBAAoB;AAEpB,YAAY,CAAC;AAEb;;;;;;;;;;;GAWG;AAEH,MAAM,UAAU,YAAY,CAAE,IAAY;IACxC,OAAO,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;AACrC,CAAC;AAED,MAAM,UAAU,eAAe,CAAE,IAAY;IAC3C,OAAO,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAC,GAAG,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAE,IAAY;IAC7C,OAAO,IAAI;SACR,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;SAC1B,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;SACpB,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CACvB;AACH,CAAC;AAED,MAAM,UAAU,UAAU,CAAE,IAAY;IACtC,OAAO,IAAI,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAC;AACrD,CAAC;AAED,MAAM,UAAU,UAAU,CAAE,IAAY;IACtC,OAAO,IAAI,CAAC,OAAO,CAAC,YAAY,EAAC,GAAG,CAAC,CAAC;AACxC,CAAC;AAED,MAAM,UAAU,cAAc,CAAE,IAAY;IAC1C,4BAA4B;IAC5B,mFAAmF;IACnF,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,yEAAyE,EAAC,EAAE,CAAC,CAAC;IAElG,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAC,GAAG,CAAC,CAAC;IAClC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAC,GAAG,CAAC,CAAC;IAClC,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,SAAS,CAAE,IAAY;IACrC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC;IAC9C,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;IACzC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;IAC5C,OAAO,IAAI,CAAC;AACd,CAAC;AAED,eAAe;IACb,YAAY;IACZ,eAAe;IACf,iBAAiB;IACjB,UAAU;IACV,UAAU;IACV,cAAc;IACd,SAAS;CACV,CAAC"} |
@ -0,0 +1,117 @@ |
|||
// unzalgo.ts
|
|||
// Copyright (C) DTP Technologies, LLC
|
|||
// License: Apache-2.0
|
|||
"use strict"; |
|||
import { percentile } from "stats-lite"; |
|||
// Matches characters in the Unicode categories Mn (nonspacing mark) and
// Me (enclosing mark).
const categories = /[\p{Mn}\p{Me}]+/u;

// Defaults used by isZalgo() and clean().
const DEFAULT_DETECTION_THRESHOLD = 0.55;
const DEFAULT_TARGET_DENSITY = 0;

// Unicode normalisation helpers (NFC composes, NFD decomposes).
const compose = (text) => {
  return text.normalize("NFC");
};
const decompose = (text) => {
  return text.normalize("NFD");
};

// Number of mark characters divided by the string length (at least 1, so
// the empty string yields 0).
const computeZalgoDensity = (text) => {
  let marks = 0;
  for (const character of text) {
    if (categories.test(character)) {
      marks += 1;
    }
  }
  return marks / Math.max(text.length, 1);
};

// Restricts x to the closed interval [0, 1].
const clamp = (x) => Math.max(Math.min(x, 1), 0);
|||
/**
 * Computes a score ∈ [0, 1] for every word in the input string. Each score
 * is the number of combining-mark characters in the word divided by the
 * word's length.
 *
 * @param text string The input string for which to compute scores.
 * @return Array<number> One score per whitespace-separated word. Empty or
 * whitespace-only input yields `[0]`.
 */
export function computeScores(text) {
  const wordScores = [];
  /**
   * Trim after decomposing so that leading/trailing whitespace cannot
   * produce an empty word. An empty word would score 0 / 0 = NaN.
   */
  const normalized = decompose(text).trim();
  if (!normalized.length) {
    wordScores.push(0);
    return wordScores;
  }
  for (const word of normalized.split(/\s+/)) {
    let banned = 0;
    for (const character of word) {
      if (categories.test(character)) {
        ++banned;
      }
    }
    wordScores.push(banned / word.length);
  }
  return wordScores;
}
|||
/**
 * Decides whether a string counts as Zalgo text. A single combining
 * character is not enough: the per-word scores from `computeScores` are
 * reduced with `percentile(scores, 0.75)` and that value is compared with
 * the threshold, so ordinary internationalized text is not flagged
 * automatically.
 *
 * @param text string A string for which a Zalgo text check is run.
 * @param detectionThreshold number A threshold, clamped to [0, 1]. The
 * higher the threshold, the more combining characters are needed for the
 * text to be detected as Zalgo text.
 * @return boolean Whether the string is a Zalgo text string.
 */
export function isZalgo(text, detectionThreshold = DEFAULT_DETECTION_THRESHOLD) {
  const aggregateScore = percentile(computeScores(text), 0.75);
  const cutoff = clamp(detectionThreshold);
  return aggregateScore >= cutoff;
}
|||
/**
 * Removes combining characters from every word that is classified as
 * Zalgo text; other words and all whitespace runs are passed through.
 *
 * If `targetDensity` is specified, not all the Zalgo characters are
 * removed. For each combining character the function keeps or drops it
 * depending on which choice brings the projected density of the word
 * closer to the target.
 *
 * @param text string
 * A string for which combining characters are removed for every word whose
 * Zalgo property is met.
 * @param options object Options for cleaning.
 * @param options.detectionThreshold number
 * A threshold ∈ [0, 1]. The higher the threshold, the more combining
 * characters are needed for it to be detected as Zalgo text.
 * @param options.targetDensity number
 * A density, clamped to [0, 1]. The higher the density, the more Zalgo
 * characters will be part of the resulting string.
 * @return string
 * A cleaned, more readable string (NFC-normalized).
 */
export function clean(text, { detectionThreshold = DEFAULT_DETECTION_THRESHOLD, targetDensity = DEFAULT_TARGET_DENSITY, } = {}) {
  let cleaned = "";
  const effectiveTargetDensity = clamp(targetDensity);
  // The capture group keeps the whitespace runs in the split result so the
  // original spacing is reproduced in the output.
  for (const word of decompose(text).split(/(\s+)/)) {
    if (!isZalgo(word, detectionThreshold)) {
      cleaned += word;
      continue;
    }
    let cleanedWord = "";
    // UTF-16 offset of the current character inside `word`. Iteration is by
    // code point, but substring() works on UTF-16 code units, so a plain
    // loop counter would drift after any astral character (e.g. emoji).
    let offset = 0;
    for (const character of word) {
      const nextOffset = offset + character.length;
      let keep = true;
      if (categories.test(character)) {
        // Projected word if this mark is kept vs. dropped.
        const admissionProjection = cleanedWord + word.substring(offset);
        const omissionProjection = cleanedWord + word.substring(nextOffset);
        const admissionDistance = effectiveTargetDensity - computeZalgoDensity(admissionProjection);
        const omissionDistance = effectiveTargetDensity - computeZalgoDensity(omissionProjection);
        // Drop the mark when doing so lands at least as close to the target.
        keep = Math.abs(omissionDistance) > Math.abs(admissionDistance);
      }
      if (keep) {
        cleanedWord += character;
      }
      offset = nextOffset;
    }
    cleaned += cleanedWord;
  }
  return compose(cleaned);
}
|||
//# sourceMappingURL=unzalgo.js.map
|
@ -0,0 +1 @@ |
|||
{"version":3,"file":"unzalgo.js","sourceRoot":"./src/","sources":["unzalgo.ts"],"names":[],"mappings":"AAAA,aAAa;AACb,sCAAsC;AACtC,sBAAsB;AAEtB,YAAY,CAAC;AAEb,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAExC,MAAM,UAAU,GAAG,kBAAkB,CAAC;AACtC,MAAM,2BAA2B,GAAG,IAAI,CAAC;AACzC,MAAM,sBAAsB,GAAG,CAAC,CAAC;AACjC,MAAM,OAAO,GAAG,CAAC,IAAY,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;AACxD,MAAM,SAAS,GAAG,CAAC,IAAY,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;AAC1D,MAAM,mBAAmB,GAAG,CAAC,IAAY,EAAE,EAAE,CAC3C,CAAC,GAAG,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM;IAClE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;AAC3B,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AAEzD;;;;;;;GAOG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,UAAU,GAAkB,EAAE,CAAC;IACrC;;;OAGG;IACH,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,EAAE,CAAC;QACxB,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACrB,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;YAChD,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,KAAK,MAAM,SAAS,IAAI,IAAI,EAAE,CAAC;gBAC7B,IAAI,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;oBAC/B,EAAE,MAAM,CAAC;gBACX,CAAC;YACH,CAAC;YACD,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;YACnC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,OAAO,CACrB,IAAY,EACZ,qBAA6B,2BAA2B;IAExD,MAAM,UAAU,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACvC,MAAM,UAAU,GAAG,UAAU,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;IAChD,OAAO,UAAU,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;AACjD,CAAC;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,UAAU,KAAK,CACnB,IAAY,EACZ,EACE,kBAAkB,GAAG,2BAA2B,EAChD,aAAa,GAAG,sBAAsB,GACvC,GAAG,EAAE;IAEN,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,MAAM,sBAAsB,GAAG,KAAK,CAAC,aAAa,CAAC,CAAC;IAOpD,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;QAClD,IAAI,OAAO,CAAC,IAAI,EAAE,kBAAkB,CAAC,EAAE,CAAC;YACtC,IAAI,WAAW,GAAG,EAAE,CAAC;YA
CrB,MAAM,OAAO,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE;gBAC1C,OAAO;oBACL,SAAS;oBACT,WAAW,EAAE,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC;iBACxC,CAAC;YACJ,CAAC,CAAC,CAAC;YACH,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;gBACxC,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,CAAC,CAAoB,CAAC;gBACjE,IAAI,WAAW,EAAE,CAAC;oBAChB,MAAM,mBAAmB,GAAG,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;oBAC5D,MAAM,kBAAkB,GAAG,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;oBAC/D,MAAM,iBAAiB,GACrB,sBAAsB,GAAG,mBAAmB,CAAC,mBAAmB,CAAC,CAAC;oBACpE,MAAM,gBAAgB,GACpB,sBAAsB,GAAG,mBAAmB,CAAC,kBAAkB,CAAC,CAAC;oBACnE,IAAI,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,iBAAiB,CAAC,EAAE,CAAC;wBAC9D,SAAS;oBACX,CAAC;gBACH,CAAC;gBACD,WAAW,IAAI,SAAS,CAAC;YAC3B,CAAC;YACD,OAAO,IAAI,WAAW,CAAC;QACzB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,IAAI,CAAC;QAClB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC,OAAO,CAAC,CAAC;AAC1B,CAAC"} |
@ -0,0 +1 @@ |
|||
export { cleanText, filterText } from "./dist/cleantext.js"; |
@ -0,0 +1,38 @@ |
|||
{ |
|||
"name": "dtp-cleantext", |
|||
"version": "1.0.0", |
|||
"description": "Text filtering and safety to make garbage people very sad when they can't hack your node.", |
|||
"type": "module", |
|||
"main": "index.js", |
|||
"scripts": { |
|||
"build": "tsc", |
|||
"test": "echo \"Error: no test specified\" && exit 1" |
|||
}, |
|||
"types": "./types/dtp-cleantext.d.ts", |
|||
"keywords": [ |
|||
"xss", |
|||
"sanitize", |
|||
"filter", |
|||
"clean" |
|||
], |
|||
"author": { |
|||
"name": "Rob Colbert", |
|||
"email": "[email protected]", |
|||
"url": "https://digitaltelepresence.com/" |
|||
}, |
|||
"license": "Apache-2.0", |
|||
"packageManager": "[email protected]", |
|||
"dependencies": { |
|||
"diacritics": "^1.3.0", |
|||
"shoetest": "^1.2.2", |
|||
"stats-lite": "^2.2.0", |
|||
"striptags": "^3.2.0" |
|||
}, |
|||
"devDependencies": { |
|||
"@types/diacritics": "^1.3.3", |
|||
"@types/node": "^22.13.10", |
|||
"@types/stats-lite": "^2.2.2", |
|||
"ts-node": "^10.9.2", |
|||
"typescript": "^5.8.2" |
|||
} |
|||
} |
@ -0,0 +1,248 @@ |
|||
lockfileVersion: '9.0' |
|||
|
|||
settings: |
|||
autoInstallPeers: true |
|||
excludeLinksFromLockfile: false |
|||
|
|||
importers: |
|||
|
|||
.: |
|||
dependencies: |
|||
diacritics: |
|||
specifier: ^1.3.0 |
|||
version: 1.3.0 |
|||
shoetest: |
|||
specifier: ^1.2.2 |
|||
version: 1.2.2 |
|||
stats-lite: |
|||
specifier: ^2.2.0 |
|||
version: 2.2.0 |
|||
striptags: |
|||
specifier: ^3.2.0 |
|||
version: 3.2.0 |
|||
devDependencies: |
|||
'@types/diacritics': |
|||
specifier: ^1.3.3 |
|||
version: 1.3.3 |
|||
'@types/node': |
|||
specifier: ^22.13.10 |
|||
version: 22.13.10 |
|||
'@types/stats-lite': |
|||
specifier: ^2.2.2 |
|||
version: 2.2.2 |
|||
ts-node: |
|||
specifier: ^10.9.2 |
|||
version: 10.9.2(@types/[email protected])([email protected]) |
|||
typescript: |
|||
specifier: ^5.8.2 |
|||
version: 5.8.2 |
|||
|
|||
packages: |
|||
|
|||
'@cspotcode/[email protected]': |
|||
resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==} |
|||
engines: {node: '>=12'} |
|||
|
|||
'@jridgewell/[email protected]': |
|||
resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==} |
|||
engines: {node: '>=6.0.0'} |
|||
|
|||
'@jridgewell/[email protected]': |
|||
resolution: {integrity: sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==} |
|||
|
|||
'@jridgewell/[email protected]': |
|||
resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==} |
|||
|
|||
'@tsconfig/[email protected]': |
|||
resolution: {integrity: sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw==} |
|||
|
|||
'@tsconfig/[email protected]': |
|||
resolution: {integrity: sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==} |
|||
|
|||
'@tsconfig/[email protected]': |
|||
resolution: {integrity: sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==} |
|||
|
|||
'@tsconfig/[email protected]': |
|||
resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==} |
|||
|
|||
'@types/[email protected]': |
|||
resolution: {integrity: sha512-wt0tBItmBsOUVZ8+MCrkBMoVfH/EUZeTXwYSekVVYilZlGDYssREUR+sX72mHvl2IrbdCKgpYARXKh3awD2how==} |
|||
|
|||
'@types/[email protected]': |
|||
resolution: {integrity: sha512-I6LPUvlRH+O6VRUqYOcMudhaIdUVWfsjnZavnsraHvpBwaEyMN29ry+0UVJhImYL16xsscu0aske3yA+uPOWfw==} |
|||
|
|||
'@types/[email protected]': |
|||
resolution: {integrity: sha512-T+bzT53cbPbE0hMlCNZux1QuH6hQFNHIwRMTQCu3YPG0W7XUfeoULHl+TehJCjaxQx8cz4wlg5oQsOyG9LvZmA==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==} |
|||
engines: {node: '>=0.4.0'} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-OvQ/2pUDKmgfCg++xsTX1wGxfTaszcHVcTctW4UJB4hibJx2HXxxO5UmVgyjMa+ZDsiaf5wWLXYpRWMmBI0QHg==} |
|||
engines: {node: '>=0.4.0'} |
|||
hasBin: true |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-wlwEkqcsaxvPJML+rDh/2iS824jbREk6DUMUKkEaSlxdYHeS43cClJtsWglvw2RfeXGm6ohKDqsXteJ5sP5enA==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==} |
|||
engines: {node: '>=0.3.1'} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-pYxfDYpued//QpnLIm4Avk7rsNtAtQkUES2cwAYSvD/wd2pKD71gN2Ebj3e7klzXwjocvE8c5vx/1fxwpqmSxA==} |
|||
engines: {node: '>=4'} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-JLiSz/zsZcGFXPrB4I/AGBvtStkt+8QmksyZBZnVXnnK9XdTEyz0tX8CRYljtwYDuIuZzih6DpHQdi+3Q6zHPw==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-U+5l2KrcMNOUPYvazA3h5ekF80FHTUG+87SEAmHZmolh1M+i/WyTCxVzmi+tidIa1tM4BSe8g2Y/D3loWDjj+w==} |
|||
engines: {node: '>=4'} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-M0b3YWQs7R3Z917WRQy1HHA7Ba7D8hvZg6UE5mLykJxQVE2ju0IXbGlaHPPlkY+WN7wFP+wUMXmBFA0aV6vYGQ==} |
|||
engines: {node: '>=4'} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-iT8kIEFcGfUwo53VUFckm+glTkc0oLycRe+YqU/W4wQuIHGIWc5KMIpDnJVdavKCyEZKQTi8IDq27rDmB09QjA==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-/Kz55rgUIv2KP2MKphwYT/NCuSfAlbbMRv2ZWw7wyXayu230zdtzhxxuXXcvsc6EmmhS8bSJl3uS1wmMHFumbA==} |
|||
engines: {node: '>=2.0.0'} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-g45ZOGzHDMe2bdYMdIvdAfCQkCTDMGBazSw1ypMowwGIee7ZQ5dU0rBJ8Jqgl+jAKIv4dbeE1jscZq9wid1Tkw==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==} |
|||
hasBin: true |
|||
peerDependencies: |
|||
'@swc/core': '>=1.2.50' |
|||
'@swc/wasm': '>=1.2.50' |
|||
'@types/node': '*' |
|||
typescript: '>=2.7' |
|||
peerDependenciesMeta: |
|||
'@swc/core': |
|||
optional: true |
|||
'@swc/wasm': |
|||
optional: true |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==} |
|||
engines: {node: '>=14.17'} |
|||
hasBin: true |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==} |
|||
|
|||
[email protected]: |
|||
resolution: {integrity: sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==} |
|||
engines: {node: '>=6'} |
|||
|
|||
snapshots: |
|||
|
|||
'@cspotcode/[email protected]': |
|||
dependencies: |
|||
'@jridgewell/trace-mapping': 0.3.9 |
|||
|
|||
'@jridgewell/[email protected]': {} |
|||
|
|||
'@jridgewell/[email protected]': {} |
|||
|
|||
'@jridgewell/[email protected]': |
|||
dependencies: |
|||
'@jridgewell/resolve-uri': 3.1.2 |
|||
'@jridgewell/sourcemap-codec': 1.5.0 |
|||
|
|||
'@tsconfig/[email protected]': {} |
|||
|
|||
'@tsconfig/[email protected]': {} |
|||
|
|||
'@tsconfig/[email protected]': {} |
|||
|
|||
'@tsconfig/[email protected]': {} |
|||
|
|||
'@types/[email protected]': {} |
|||
|
|||
'@types/[email protected]': |
|||
dependencies: |
|||
undici-types: 6.20.0 |
|||
|
|||
'@types/[email protected]': {} |
|||
|
|||
[email protected]: |
|||
dependencies: |
|||
acorn: 8.14.1 |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: |
|||
dependencies: |
|||
drange: 1.1.1 |
|||
ret: 0.2.2 |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: |
|||
dependencies: |
|||
randexp: 0.5.3 |
|||
|
|||
[email protected]: |
|||
dependencies: |
|||
isnumber: 1.0.0 |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected](@types/[email protected])([email protected]): |
|||
dependencies: |
|||
'@cspotcode/source-map-support': 0.8.1 |
|||
'@tsconfig/node10': 1.0.11 |
|||
'@tsconfig/node12': 1.0.11 |
|||
'@tsconfig/node14': 1.0.3 |
|||
'@tsconfig/node16': 1.0.4 |
|||
'@types/node': 22.13.10 |
|||
acorn: 8.14.1 |
|||
acorn-walk: 8.3.4 |
|||
arg: 4.1.3 |
|||
create-require: 1.1.1 |
|||
diff: 4.0.2 |
|||
make-error: 1.3.6 |
|||
typescript: 5.8.2 |
|||
v8-compile-cache-lib: 3.0.1 |
|||
yn: 3.1.1 |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
|||
|
|||
[email protected]: {} |
@ -0,0 +1,54 @@ |
|||
// cleantext.ts
|
|||
// Copyright (C) DTP Technologies, LLC
|
|||
// License: Apache-2.0
|
|||
|
|||
import { createRequire } from "module"; |
|||
const require = createRequire(import.meta.url); // jshint ignore:line
|
|||
|
|||
import WebTextFilter from "./lib/edit-with-vi.js"; |
|||
import { clean } from "./unzalgo.js"; |
|||
|
|||
import striptags from "striptags"; |
|||
import diacritics from "diacritics"; |
|||
|
|||
const shoetest = require("shoetest"); |
|||
|
|||
/**
 * Lightweight cleaner: runs the unzalgo pass over the text, trims the
 * result, and hands it to striptags.
 *
 * @param text The text to be cleaned
 * @returns The cleaned text
 */
export function cleanText(text: string): string {
  const unzalgoed = clean(text);
  return striptags(unzalgoed.trim());
}
|||
|
|||
/**
 * The heavy hammer of text filtering. Applies, in order: the custom
 * nonsense/guff/HTML filters, the shoetest simplification, diacritics
 * removal, and finally `cleanText` (unzalgo + trim + striptags).
 *
 * This filter is very heavy-handed and merciless.
 *
 * @param text The text to be filtered
 * @returns The filtered text. Non-string and empty input is returned
 * unchanged.
 */
export function filterText(text: string): string {
  // Nothing to do for non-strings (untyped callers may pass null,
  // undefined, numbers, ...) or for the empty string.
  if (typeof text !== "string" || text.length === 0) {
    return text;
  }

  text = WebTextFilter.filterNonsense(text);
  text = WebTextFilter.filterGuff(text);
  text = WebTextFilter.filterHtml(text);

  text = shoetest.simplify(text);
  text = diacritics.remove(text);

  /*
   * Once all the stupidity has been stripped, strip the HTML tags that
   * might remain. This has to be cleanText() rather than clean(): clean()
   * only removes Zalgo marks and never touches tags.
   */
  return cleanText(text);
}
@ -0,0 +1,77 @@ |
|||
// edit-with-vi.ts
|
|||
// Copyright (C) 2025 DTP Technologies, LLC
|
|||
// All Rights Reserved
|
|||
|
|||
// Based on:
|
|||
// https://github.com/voidful/text-filtering-js/blob/master/text_filtering.js
|
|||
// - Does not extend String because stop it.
|
|||
// - ES module (uses `export`, not `module.exports`)
|
|||
|
|||
'use strict'; |
|||
|
|||
/* |
|||
* This file must only be edited with vi/vim. If you so much as *open* this file |
|||
* in VSCode, you've probably damaged the file. Do not save it. Just close it, |
|||
* and go edit the file with vi or vim. |
|||
* |
|||
* VS Code, being web-based, contains logic to filter out the content used to |
|||
* implement the filter. You will erase that content, and then various attackers |
|||
* will own your chat. |
|||
* |
|||
* If attackers have owned your chat, you may want to revert or otherwise restore |
|||
 * this file to its original state. |
|||
*/ |
|||
|
|||
/**
 * Removes BBcode-style tags such as `[b]` and `[/b]`.
 *
 * Each bracketed tag is matched on its own (`[^\]]*`), so the text between
 * an opening and a closing tag is kept. A greedy `.*` would delete
 * everything from the first `[` to the last `]` on the line.
 *
 * @param text The text to filter
 * @returns The text with every `[...]` span removed
 */
export function filterBBcode (text: string) : string {
  return text.replace(/\[[^\]]*\]/g, '');
}
|||
|
|||
/**
 * Replaces every line break (CRLF, LF or CR) with a single space.
 *
 * @param text The text to filter
 * @returns The text with line breaks replaced by spaces
 */
export function filterLineBreak (text: string) : string {
  const lineBreak = /(\r\n|\n|\r)/gm;
  const flattened = text.replace(lineBreak, " ");
  return flattened;
}
|||
|
|||
/**
 * Strips smiley codes and numeric character references by applying four
 * patterns one after another; each match is deleted.
 *
 * @param text The text to filter
 * @returns The text with all matches removed
 */
export function filterSmileysCode (text: string) : string {
  // Order matters: each pattern runs on the output of the previous one.
  const patterns = [
    /:\$?.*:\$?/g,
    /:\w+:?/g,
    /:\w+/g,
    /&#.*;/g,
  ];
  let result = text;
  for (const pattern of patterns) {
    result = result.replace(pattern, '');
  }
  return result;
}
|||
|
|||
/**
 * Removes every occurrence of the forum placeholder notice from the text.
 *
 * `String.prototype.replace` with a string pattern only replaces the first
 * occurrence, so split/join is used to drop all of them.
 *
 * @param text The text to filter
 * @returns The text without the placeholder notice
 */
export function filterGuff (text: string) : string {
  const notice = '*** 作者被禁止或刪除 內容自動屏蔽 ***';
  return text.split(notice).join('');
}
|||
|
|||
/**
 * Replaces every `<...>` span with a single space.
 *
 * @param text The text to filter
 * @returns The text with each angle-bracket span replaced by a space
 */
export function filterHtml (text: string) : string {
  const tagPattern = /(<[^>]*>)/g;
  const withoutTags = text.replace(tagPattern, ' ');
  return withoutTags;
}
|||
|
|||
/**
 * Strips invisible and control characters, and normalises en dashes.
 *
 * Removed: C0 control characters except LF (U+000A) and CR (U+000D), DEL
 * (U+007F), soft hyphen (U+00AD), U+200B-U+200D, U+3000 and U+FEFF.
 * Every en dash (U+2013) becomes an ASCII hyphen.
 *
 * @param text The text to filter
 * @returns The filtered text
 */
export function filterNonsense (text: string) : string {
  // Derived from the original class [\u0000-\u001F...] but edited to allow
  // CR and LF. The upper part of the C0 range must stay a range
  // (\u000e-\u001f); a lone \u000e would let U+000F-U+001F through.
  text = text.replace(/[\u0000-\u0009\u000b\u000c\u000e-\u001f\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g,'');

  // Soft hyphens are already deleted by the class above, so no separate
  // U+00AD replacement is needed. En dashes need the global flag so that
  // all of them are replaced, not just the first.
  text = text.replace(/\u2013/g,'-');
  return text;
}
|||
|
|||
/**
 * Runs the smiley, BBcode, guff, HTML and line-break filters in sequence.
 *
 * The filters are called directly: this file is an ES module, where
 * `module.exports` does not exist and referencing it throws.
 *
 * @param text The text to filter
 * @returns The filtered text
 */
export function filterAll (text: string) : string {
  text = filterSmileysCode(text);
  text = filterBBcode(text);
  text = filterGuff(text);
  text = filterHtml(text);
  text = filterLineBreak(text);
  return text;
}
|||
|
|||
// Default export: all filters bundled on one object, for callers that
// prefer `WebTextFilter.filterX(...)` over named imports.
export default {
  filterBBcode: filterBBcode,
  filterLineBreak: filterLineBreak,
  filterSmileysCode: filterSmileysCode,
  filterGuff: filterGuff,
  filterHtml: filterHtml,
  filterNonsense: filterNonsense,
  filterAll: filterAll,
};
@ -0,0 +1,138 @@ |
|||
// unzalgo.ts
|
|||
// Copyright (C) DTP Technologies, LLC
|
|||
// License: Apache-2.0
|
|||
|
|||
"use strict"; |
|||
|
|||
import { percentile } from "stats-lite"; |
|||
|
|||
// Matches characters in the Unicode categories Mn (nonspacing mark) and
// Me (enclosing mark).
const categories = /[\p{Mn}\p{Me}]+/u;

// Defaults used by isZalgo() and clean().
const DEFAULT_DETECTION_THRESHOLD = 0.55;
const DEFAULT_TARGET_DENSITY = 0;

// Unicode normalisation helpers (NFC composes, NFD decomposes).
const compose = (text: string) => {
  return text.normalize("NFC");
};
const decompose = (text: string) => {
  return text.normalize("NFD");
};

// Number of mark characters divided by the string length (at least 1, so
// the empty string yields 0).
const computeZalgoDensity = (text: string) => {
  let marks = 0;
  for (const character of text) {
    if (categories.test(character)) {
      marks += 1;
    }
  }
  return marks / Math.max(text.length, 1);
};

// Restricts x to the closed interval [0, 1].
const clamp = (x: number) => Math.max(Math.min(x, 1), 0);
|||
|
|||
/**
 * Computes a score ∈ [0, 1] for every word in the input string. Each score
 * is the number of combining-mark characters in the word divided by the
 * word's length.
 *
 * @param text The input string for which to compute scores.
 * @returns One score per whitespace-separated word. Empty or
 * whitespace-only input yields `[0]`.
 */
export function computeScores(text: string): Array<number> {
  const wordScores: Array<number> = [];
  /**
   * Trim after decomposing so that leading/trailing whitespace cannot
   * produce an empty word. An empty word would score 0 / 0 = NaN.
   */
  const normalized = decompose(text).trim();
  if (!normalized.length) {
    wordScores.push(0);
    return wordScores;
  }
  for (const word of normalized.split(/\s+/)) {
    let banned = 0;
    for (const character of word) {
      if (categories.test(character)) {
        ++banned;
      }
    }
    wordScores.push(banned / word.length);
  }
  return wordScores;
}
|||
|
|||
/**
 * Decides whether a string counts as Zalgo text. A single combining
 * character is not enough: the per-word scores from `computeScores` are
 * reduced with `percentile(scores, 0.75)` and that value is compared with
 * the threshold, so ordinary internationalized text is not flagged
 * automatically.
 *
 * @param text A string for which a Zalgo text check is run.
 * @param detectionThreshold A threshold, clamped to [0, 1]. The higher the
 * threshold, the more combining characters are needed for the text to be
 * detected as Zalgo text.
 * @returns Whether the string is a Zalgo text string.
 */
export function isZalgo(
  text: string,
  detectionThreshold: number = DEFAULT_DETECTION_THRESHOLD
): boolean {
  const aggregateScore = percentile(computeScores(text), 0.75);
  const cutoff = clamp(detectionThreshold);
  return aggregateScore >= cutoff;
}
|||
|
|||
/**
 * Removes combining characters from every word of a string that is
 * classified as Zalgo text; all other words and all whitespace are kept
 * unchanged.
 *
 * If `targetDensity` is specified, not all Zalgo characters are removed.
 * Instead they are thinned out: each candidate character is kept only when
 * keeping it brings the word's density closer to the target than dropping it
 * would (ties drop the character).
 *
 * @param text The string to clean.
 * @param options Options for cleaning.
 * @param options.detectionThreshold A threshold ∈ [0, 1]. The higher the
 * threshold, the more combining characters are needed for a word to be
 * detected as Zalgo text.
 * @param options.targetDensity A density ∈ [0, 1]; values outside that range
 * are clamped into it. The higher the density, the more Zalgo characters
 * remain in the result.
 * @returns The cleaned string, passed through `compose` before returning.
 */
export function clean(
  text: string,
  {
    detectionThreshold = DEFAULT_DETECTION_THRESHOLD,
    targetDensity = DEFAULT_TARGET_DENSITY,
  } = {}
): string {
  const effectiveTargetDensity = clamp(targetDensity);
  let cleaned = "";

  // The capture group makes `split` return the whitespace runs as tokens too,
  // so the original spacing survives the round trip.
  for (const word of decompose(text).split(/(\s+)/)) {
    if (!isZalgo(word, detectionThreshold)) {
      cleaned += word;
      continue;
    }

    // Work on code points throughout. Indexing the raw string by code-point
    // position (as `substring(i)` would) misaligns the projections as soon as
    // the word contains a character outside the BMP, such as an emoji.
    const letters = [...word].map((character) => ({
      character,
      isCandidate: categories.test(character),
    }));

    let cleanedWord = "";
    for (const [i, { character, isCandidate }] of letters.entries()) {
      if (isCandidate) {
        const remainder = letters
          .slice(i + 1)
          .map((letter) => letter.character)
          .join("");
        // What the word would look like if this character is kept / dropped.
        const admissionProjection = cleanedWord + character + remainder;
        const omissionProjection = cleanedWord + remainder;
        const admissionDistance =
          effectiveTargetDensity - computeZalgoDensity(admissionProjection);
        const omissionDistance =
          effectiveTargetDensity - computeZalgoDensity(omissionProjection);
        if (Math.abs(omissionDistance) <= Math.abs(admissionDistance)) {
          // Dropping lands at least as close to the target density: skip it.
          continue;
        }
      }
      cleanedWord += character;
    }
    cleaned += cleanedWord;
  }

  return compose(cleaned);
}
@ -0,0 +1,37 @@ |
|||
{ |
|||
"compilerOptions": { |
|||
"target": "ES2022", |
|||
"lib": ["es2022", "dom"], |
|||
"experimentalDecorators": true, |
|||
"emitDecoratorMetadata": true, |
|||
"module": "ESNext", |
|||
"rootDir": "./src", |
|||
"moduleResolution": "node", |
|||
"baseUrl": "./src", |
|||
"typeRoots": ["node_modules/@types", "types"], |
|||
"allowImportingTsExtensions": true, |
|||
"rewriteRelativeImportExtensions": true, |
|||
"resolveJsonModule": true, |
|||
"declaration": false, |
|||
"sourceMap": true, |
|||
"outDir": "dist", |
|||
"removeComments": false, |
|||
"importHelpers": true, |
|||
"sourceRoot": "./src", |
|||
"esModuleInterop": true, |
|||
"forceConsistentCasingInFileNames": true, |
|||
"strict": true, |
|||
"alwaysStrict": true, |
|||
"noUnusedLocals": true, |
|||
"noUnusedParameters": true, |
|||
"noImplicitReturns": true, |
|||
"noFallthroughCasesInSwitch": true, |
|||
"noUncheckedIndexedAccess": true, |
|||
"pretty": true, |
|||
"skipDefaultLibCheck": true, |
|||
"skipLibCheck": true |
|||
}, |
|||
"include": ["src/**/*.ts"], |
|||
"exclude": ["node_modules", "docs"], |
|||
"files": ["types/dtp-cleantext.d.ts"] |
|||
} |
@ -0,0 +1,4 @@ |
|||
declare module "dtp-cleantext" {
  /**
   * Cleans the input text. The package README describes this as an unzalgo
   * pass followed by striptags.
   *
   * @param text The text to clean.
   * @returns The cleaned text.
   */
  export function cleanText(text: string): string;

  /**
   * Filters the input text. The package README describes this as a more
   * comprehensive filter than `cleanText` that finishes with a call to
   * `cleanText`.
   *
   * @param text The text to filter.
   * @param options Optional settings, accepted because the README usage is
   * `filterText(input, options)`.
   * NOTE(review): the option shape is not visible from this declaration file;
   * tighten the type once confirmed against the implementation.
   * @returns The filtered text.
   */
  export function filterText(text: string, options?: object): string;
}
Loading…
Reference in new issue