Browse Source

project created

develop
Rob Colbert 1 month ago
parent
commit
e311aab85a
  1. 1
      .gitignore
  2. 13
      LICENSE
  3. 26
      README.md
  4. 46
      dist/cleantext.js
  5. 1
      dist/cleantext.js.map
  6. 65
      dist/lib/edit-with-vi.js
  7. 1
      dist/lib/edit-with-vi.js.map
  8. 117
      dist/unzalgo.js
  9. 1
      dist/unzalgo.js.map
  10. 1
      index.js
  11. 38
      package.json
  12. 248
      pnpm-lock.yaml
  13. 54
      src/cleantext.ts
  14. 77
      src/lib/edit-with-vi.ts
  15. 138
      src/unzalgo.ts
  16. 37
      tsconfig.json
  17. 4
      types/dtp-cleantext.d.ts

1
.gitignore

@ -0,0 +1 @@
node_modules

13
LICENSE

@ -0,0 +1,13 @@
Copyright 2025 DTP Technologies, LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

26
README.md

@ -0,0 +1,26 @@
# DTP CleanText
We hate "hackers" and chucklefucks _this_ much. For real for real. Cap.
## cleanText
Performs an unzalgo and striptags on the input text.
## Usage
```javascript
var DTP = require("dtp-cleantext");
const clean = DTP.cleanText(input);
```
## filterText
A more comprehensive and complete filtering of input text that includes the filtering of nonsense, guff, HTML. Then, it performs a shoetest simplification, and removes all diacritics. It finishes with a call to `cleanText` for convenience.
## Usage
```javascript
var DTP = require("dtp-cleantext");
const filtered = DTP.filterText(input);
```

46
dist/cleantext.js

@ -0,0 +1,46 @@
// cleantext.ts
// Copyright (C) DTP Technologies, LLC
// License: Apache-2.0
import { createRequire } from "module";
const require = createRequire(import.meta.url); // jshint ignore:line
import WebTextFilter from "./lib/edit-with-vi.js";
import { clean } from "./unzalgo.js";
import striptags from "striptags";
import diacritics from "diacritics";
const shoetest = require("shoetest");
/**
 * Light-weight cleaning pass: removes Zalgo combining marks, trims the
 * result, and strips markup tags.
 * @param text string The text to be cleaned
 * @returns The cleaned text
 */
export function cleanText(text) {
  const unzalgoed = clean(text);
  return striptags(unzalgoed.trim());
}
/**
 * The heavy hammer of text filtering that removes all malicious and annoying
 * things I know about as of this writing. Zalgo, tags, shoetest, diacritics,
 * and our own custom nonsense UTF-8 and Unicode filters.
 *
 * This filter is very heavy-handed and merciless.
 *
 * Falsy or non-string input is returned unchanged.
 *
 * @param text string The text to be filtered
 * @returns The filtered text
 */
export function filterText(text) {
  if (typeof text !== "string" || text.length === 0) {
    return text;
  }

  // Custom filters first: control/invisible characters, the known
  // boilerplate banner, then anything that looks like a complete tag.
  text = WebTextFilter.filterNonsense(text);
  text = WebTextFilter.filterGuff(text);
  text = WebTextFilter.filterHtml(text);

  // Third-party normalisation: look-alike simplification, then diacritics.
  text = shoetest.simplify(text);
  text = diacritics.remove(text);

  /*
   * Once all the stupidity has been stripped, finish with cleanText so the
   * Zalgo pass is followed by a trim and a striptags pass over whatever
   * markup might remain. (Calling clean() alone never stripped tags.)
   */
  return cleanText(text);
}
//# sourceMappingURL=cleantext.js.map

1
dist/cleantext.js.map

@ -0,0 +1 @@
{"version":3,"file":"cleantext.js","sourceRoot":"./src/","sources":["cleantext.ts"],"names":[],"mappings":"AAAA,eAAe;AACf,sCAAsC;AACtC,sBAAsB;AAEtB,OAAO,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AACvC,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,qBAAqB;AAErE,OAAO,aAAa,MAAM,uBAAuB,CAAC;AAClD,OAAO,EAAE,KAAK,EAAE,MAAM,cAAc,CAAC;AAErC,OAAO,SAAS,MAAM,WAAW,CAAC;AAClC,OAAO,UAAU,MAAM,YAAY,CAAC;AAEpC,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAErC;;;;GAIG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY;IACpC,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC;IACnB,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;IAC9B,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,IAAI,CAAC,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,IAAI,GAAG,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IAC1C,IAAI,GAAG,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACtC,IAAI,GAAG,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IAEtC,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE/B;;;OAGG;IACH,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC;AACrB,CAAC"}

65
dist/lib/edit-with-vi.js

@ -0,0 +1,65 @@
// edit-with-vi.ts
// Copyright (C) 2025 DTP Technologies, LLC
// All Rights Reserved
// Based on:
// https://github.com/voidful/text-filtering-js/blob/master/text_filtering.js
// - Does not extend String because stop it.
// - ES module (upstream is a CommonJS module)
'use strict';
/*
* This file must only be edited with vi/vim. If you so much as *open* this file
* in VSCode, you've probably damaged the file. Do not save it. Just close it,
* and go edit the file with vi or vim.
*
* VS Code, being web-based, contains logic to filter out the content used to
* implement the filter. You will erase that content, and then various attackers
* will own your chat.
*
* If attackers have owned your chat, you may want to revert or otherwise restore
* this file to its original state.
*/
/**
 * Removes BBcode-style tags (anything between `[` and the next `]` on the
 * same line) while keeping the text between tags.
 *
 * The match is lazy so that `[b]bold[/b]` becomes `bold`; a greedy match
 * would delete everything from the first `[` to the last `]` on the line.
 *
 * @param text string The text to filter
 * @returns The text with bracketed tags removed
 */
export function filterBBcode(text) {
  return text.replace(/\[.*?\]/g, '');
}
/**
 * Replaces every line break (CRLF, LF or CR) with a single space.
 *
 * @param text string The text to filter
 * @returns The text flattened onto one line
 */
export function filterLineBreak(text) {
  const lineBreaks = /(\r\n|\n|\r)/gm;
  const flattened = text.replace(lineBreaks, " ");
  return flattened;
}
/**
 * Removes smiley short-codes (`:name:`, `:name`) and `&#...;` numeric
 * entity runs by applying a fixed list of patterns in order.
 *
 * @param text string The text to filter
 * @returns The text with every pattern match removed
 */
export function filterSmileysCode(text) {
  // Order matters: each pattern runs on the output of the previous one.
  const patterns = [/:\$?.*:\$?/g, /:\w+:?/g, /:\w+/g, /&#.*;/g];
  let result = text;
  for (const pattern of patterns) {
    result = result.replace(pattern, '');
  }
  return result;
}
/**
 * Removes every occurrence of the known boilerplate banner from the text.
 *
 * `String.prototype.replace` with a string pattern only removes the first
 * occurrence, so the banner is removed with split/join instead.
 *
 * @param text string The text to filter
 * @returns The text with all banner occurrences removed
 */
export function filterGuff(text) {
  return text.split('*** 作者被禁止或刪除 內容自動屏蔽 ***').join('');
}
/**
 * Replaces every `<...>` tag-shaped run with a single space.
 *
 * @param text string The text to filter
 * @returns The text with tags replaced by spaces
 */
export function filterHtml(text) {
  const tagPattern = /(<[^>]*>)/g;
  const spacer = ' ';
  return text.replace(tagPattern, spacer);
}
/**
 * Strips control and invisible characters and normalises en dashes.
 *
 * Removed: U+0000-U+0009, U+000B, U+000C, U+000E-U+001F, U+007F (DEL),
 * U+00AD (soft hyphen), U+200B-U+200D (zero-width characters), U+3000
 * (ideographic space) and U+FEFF (BOM). LF (U+000A) and CR (U+000D) are
 * deliberately kept. Every en dash (U+2013) becomes an ASCII hyphen.
 *
 * @param text string The text to filter
 * @returns The filtered text
 */
export function filterNonsense(text) {
  // Upstream class was \u0000-\u001F; it was edited to allow CR and LF. The
  // range must resume at \u000e and run through \u001f, otherwise control
  // characters such as ESC (U+001B) slip through.
  text = text.replace(/[\u0000-\u0009\u000b\u000c\u000e-\u001f\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g, '');
  // Soft hyphens are already removed by the class above, so the former
  // follow-up replacement of U+00AD with a space could never match.
  // NOTE(review): if a space was intended, drop \u00AD from the class instead.
  text = text.replace(/\u2013/g, '-');
  return text;
}
/**
 * Runs the smiley, BBcode, guff, HTML and line-break filters in sequence.
 *
 * The filters are called directly: this file is an ES module, where
 * `module.exports` does not exist and referencing it throws at runtime.
 *
 * @param text string The text to filter
 * @returns The filtered text
 */
export function filterAll(text) {
  text = filterSmileysCode(text);
  text = filterBBcode(text);
  text = filterGuff(text);
  text = filterHtml(text);
  text = filterLineBreak(text);
  return text;
}
// Default export: every filter gathered on one object for callers that prefer
// `WebTextFilter.filterX(...)` over named imports.
const WebTextFilter = {
  filterBBcode,
  filterLineBreak,
  filterSmileysCode,
  filterGuff,
  filterHtml,
  filterNonsense,
  filterAll,
};
export default WebTextFilter;
//# sourceMappingURL=edit-with-vi.js.map

1
dist/lib/edit-with-vi.js.map

@ -0,0 +1 @@
{"version":3,"file":"edit-with-vi.js","sourceRoot":"./src/","sources":["lib/edit-with-vi.ts"],"names":[],"mappings":"AAAA,kBAAkB;AAClB,2CAA2C;AAC3C,sBAAsB;AAEtB,YAAY;AACZ,6EAA6E;AAC7E,4CAA4C;AAC5C,oBAAoB;AAEpB,YAAY,CAAC;AAEb;;;;;;;;;;;GAWG;AAEH,MAAM,UAAU,YAAY,CAAE,IAAY;IACxC,OAAO,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;AACrC,CAAC;AAED,MAAM,UAAU,eAAe,CAAE,IAAY;IAC3C,OAAO,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAC,GAAG,CAAC,CAAC;AAC5C,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAE,IAAY;IAC7C,OAAO,IAAI;SACR,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;SAC1B,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC;SACpB,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CACvB;AACH,CAAC;AAED,MAAM,UAAU,UAAU,CAAE,IAAY;IACtC,OAAO,IAAI,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAC;AACrD,CAAC;AAED,MAAM,UAAU,UAAU,CAAE,IAAY;IACtC,OAAO,IAAI,CAAC,OAAO,CAAC,YAAY,EAAC,GAAG,CAAC,CAAC;AACxC,CAAC;AAED,MAAM,UAAU,cAAc,CAAE,IAAY;IAC1C,4BAA4B;IAC5B,mFAAmF;IACnF,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,yEAAyE,EAAC,EAAE,CAAC,CAAC;IAElG,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAC,GAAG,CAAC,CAAC;IAClC,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAC,GAAG,CAAC,CAAC;IAClC,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,UAAU,SAAS,CAAE,IAAY;IACrC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC;IAC9C,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;IACzC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACvC,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;IAC5C,OAAO,IAAI,CAAC;AACd,CAAC;AAED,eAAe;IACb,YAAY;IACZ,eAAe;IACf,iBAAiB;IACjB,UAAU;IACV,UAAU;IACV,cAAc;IACd,SAAS;CACV,CAAC"}

117
dist/unzalgo.js

@ -0,0 +1,117 @@
// unzalgo.ts
// Copyright (C) DTP Technologies, LLC
// License: Apache-2.0
"use strict";
import { percentile } from "stats-lite";
// Matches runs of non-spacing (Mn) and enclosing (Me) combining marks.
const categories = /[\p{Mn}\p{Me}]+/u;
const DEFAULT_DETECTION_THRESHOLD = 0.55;
const DEFAULT_TARGET_DENSITY = 0;

// Canonical composition (NFC).
function compose(text) {
  return text.normalize("NFC");
}

// Canonical decomposition (NFD), which splits base characters from marks.
function decompose(text) {
  return text.normalize("NFD");
}

// Number of combining marks divided by the string length (at least 1).
function computeZalgoDensity(text) {
  let marks = 0;
  for (const character of text) {
    if (categories.test(character)) {
      marks += 1;
    }
  }
  return marks / Math.max(text.length, 1);
}

// Restricts x to the closed interval [0, 1].
function clamp(x) {
  return Math.max(0, Math.min(1, x));
}
/**
 * Computes a score [0, 1] for every word in the input string. Each score
 * represents the ratio of combining characters to total characters in a word.
 *
 * Blank input (empty or whitespace only) yields a single score of 0.
 *
 * @param text string The input string for which to compute scores.
 * @return Array<number> An array of scores where each score describes the
 * Zalgo ratio of a word.
 */
export function computeScores(text) {
  /**
   * The decomposed text is trimmed BEFORE splitting. Splitting untrimmed
   * text produces an empty first/last word whenever the input has leading
   * or trailing whitespace, and scoring that word divides 0 by 0 (NaN).
   */
  const trimmed = decompose(text).trim();
  if (!trimmed.length) {
    return [0];
  }
  return trimmed.split(/\s+/).map((word) => {
    let banned = 0;
    for (const character of word) {
      if (categories.test(character)) {
        ++banned;
      }
    }
    return banned / word.length;
  });
}
/**
 * Decides whether a string should be treated as Zalgo text.
 *
 * A single combining character is not enough: the per-word scores are
 * reduced to their 0.75 percentile and that value is compared with the
 * (clamped) threshold, so ordinary internationalized text is not flagged
 * automatically.
 *
 * @param text string A string for which a Zalgo text check is run.
 * @param detectionThreshold number A threshold [0, 1]. The higher the
 * threshold, the more combining characters are needed for it to be detected
 * as Zalgo text.
 * @return boolean Whether the string is a Zalgo text string.
 */
export function isZalgo(text, detectionThreshold = DEFAULT_DETECTION_THRESHOLD) {
  const threshold = clamp(detectionThreshold);
  const upperQuartileScore = percentile(computeScores(text), 0.75);
  return upperQuartileScore >= threshold;
}
/**
 * Removes all combining characters for every word in a string if the word is
 * classified as Zalgo text.
 *
 * If `targetDensity` is specified, not all the Zalgo characters will be
 * removed. Instead, they will be thinned out uniformly.
 *
 * @param text string
 * A string for which combining characters are removed for every word whose
 * Zalgo property is met.
 * @param options object Options for cleaning.
 * @param options.detectionThreshold number
 * A threshold [0, 1]. The higher the threshold, the more combining
 * characters are needed for it to be detected as Zalgo text.
 * @param options.targetDensity number
 * A threshold [0, 1]. The higher the density, the more Zalgo characters
 * will be part of the resulting string.
 * @return string
 * A cleaned, more readable string.
 */
export function clean(text, { detectionThreshold = DEFAULT_DETECTION_THRESHOLD, targetDensity = DEFAULT_TARGET_DENSITY, } = {}) {
  const effectiveTargetDensity = clamp(targetDensity);
  let cleaned = "";
  // The capturing group keeps the whitespace separators in the output.
  for (const word of decompose(text).split(/(\s+)/)) {
    if (!isZalgo(word, detectionThreshold)) {
      cleaned += word;
      continue;
    }
    // Work on code points throughout. Indexing the UTF-16 string with a
    // code-point index misaligns after any astral character (e.g. emoji)
    // and lets combining marks slip through.
    const characters = [...word];
    let cleanedWord = "";
    for (const [i, character] of characters.entries()) {
      if (categories.test(character)) {
        const remainder = characters.slice(i + 1).join("");
        // Compare the density reached by keeping vs. dropping this mark and
        // drop it when that lands at least as close to the target.
        const admissionDistance = effectiveTargetDensity - computeZalgoDensity(cleanedWord + character + remainder);
        const omissionDistance = effectiveTargetDensity - computeZalgoDensity(cleanedWord + remainder);
        if (Math.abs(omissionDistance) <= Math.abs(admissionDistance)) {
          continue;
        }
      }
      cleanedWord += character;
    }
    cleaned += cleanedWord;
  }
  return compose(cleaned);
}
//# sourceMappingURL=unzalgo.js.map

1
dist/unzalgo.js.map

@ -0,0 +1 @@
{"version":3,"file":"unzalgo.js","sourceRoot":"./src/","sources":["unzalgo.ts"],"names":[],"mappings":"AAAA,aAAa;AACb,sCAAsC;AACtC,sBAAsB;AAEtB,YAAY,CAAC;AAEb,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAExC,MAAM,UAAU,GAAG,kBAAkB,CAAC;AACtC,MAAM,2BAA2B,GAAG,IAAI,CAAC;AACzC,MAAM,sBAAsB,GAAG,CAAC,CAAC;AACjC,MAAM,OAAO,GAAG,CAAC,IAAY,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;AACxD,MAAM,SAAS,GAAG,CAAC,IAAY,EAAE,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;AAC1D,MAAM,mBAAmB,GAAG,CAAC,IAAY,EAAE,EAAE,CAC3C,CAAC,GAAG,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,SAAS,EAAE,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM;IAClE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;AAC3B,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AAEzD;;;;;;;GAOG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACxC,MAAM,UAAU,GAAkB,EAAE,CAAC;IACrC;;;OAGG;IACH,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,EAAE,CAAC;QACxB,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACrB,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;YAChD,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,KAAK,MAAM,SAAS,IAAI,IAAI,EAAE,CAAC;gBAC7B,IAAI,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;oBAC/B,EAAE,MAAM,CAAC;gBACX,CAAC;YACH,CAAC;YACD,MAAM,KAAK,GAAG,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;YACnC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,OAAO,CACrB,IAAY,EACZ,qBAA6B,2BAA2B;IAExD,MAAM,UAAU,GAAG,aAAa,CAAC,IAAI,CAAC,CAAC;IACvC,MAAM,UAAU,GAAG,UAAU,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;IAChD,OAAO,UAAU,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;AACjD,CAAC;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,UAAU,KAAK,CACnB,IAAY,EACZ,EACE,kBAAkB,GAAG,2BAA2B,EAChD,aAAa,GAAG,sBAAsB,GACvC,GAAG,EAAE;IAEN,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,MAAM,sBAAsB,GAAG,KAAK,CAAC,aAAa,CAAC,CAAC;IAOpD,KAAK,MAAM,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;QAClD,IAAI,OAAO,CAAC,IAAI,EAAE,kBAAkB,CAAC,EAAE,CAAC;YACtC,IAAI,WAAW,GAAG,EAAE,CAAC;YA
CrB,MAAM,OAAO,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE;gBAC1C,OAAO;oBACL,SAAS;oBACT,WAAW,EAAE,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC;iBACxC,CAAC;YACJ,CAAC,CAAC,CAAC;YACH,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,CAAC;gBACxC,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,CAAC,CAAoB,CAAC;gBACjE,IAAI,WAAW,EAAE,CAAC;oBAChB,MAAM,mBAAmB,GAAG,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;oBAC5D,MAAM,kBAAkB,GAAG,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;oBAC/D,MAAM,iBAAiB,GACrB,sBAAsB,GAAG,mBAAmB,CAAC,mBAAmB,CAAC,CAAC;oBACpE,MAAM,gBAAgB,GACpB,sBAAsB,GAAG,mBAAmB,CAAC,kBAAkB,CAAC,CAAC;oBACnE,IAAI,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,iBAAiB,CAAC,EAAE,CAAC;wBAC9D,SAAS;oBACX,CAAC;gBACH,CAAC;gBACD,WAAW,IAAI,SAAS,CAAC;YAC3B,CAAC;YACD,OAAO,IAAI,WAAW,CAAC;QACzB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,IAAI,CAAC;QAClB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC,OAAO,CAAC,CAAC;AAC1B,CAAC"}

1
index.js

@ -0,0 +1 @@
export { cleanText, filterText } from "./dist/cleantext.js";

38
package.json

@ -0,0 +1,38 @@
{
"name": "dtp-cleantext",
"version": "1.0.0",
"description": "Text filtering and safety to make garbage people very sad when they can't hack your node.",
"type": "module",
"main": "index.js",
"scripts": {
"build": "tsc",
"test": "echo \"Error: no test specified\" && exit 1"
},
"types": "./types/dtp-cleantext.d.ts",
"keywords": [
"xss",
"sanitize",
"filter",
"clean"
],
"author": {
"name": "Rob Colbert",
"email": "[email protected]",
"url": "https://digitaltelepresence.com/"
},
"license": "Apache-2.0",
"packageManager": "[email protected]",
"dependencies": {
"diacritics": "^1.3.0",
"shoetest": "^1.2.2",
"stats-lite": "^2.2.0",
"striptags": "^3.2.0"
},
"devDependencies": {
"@types/diacritics": "^1.3.3",
"@types/node": "^22.13.10",
"@types/stats-lite": "^2.2.2",
"ts-node": "^10.9.2",
"typescript": "^5.8.2"
}
}

248
pnpm-lock.yaml

@ -0,0 +1,248 @@
lockfileVersion: '9.0'
settings:
autoInstallPeers: true
excludeLinksFromLockfile: false
importers:
.:
dependencies:
diacritics:
specifier: ^1.3.0
version: 1.3.0
shoetest:
specifier: ^1.2.2
version: 1.2.2
stats-lite:
specifier: ^2.2.0
version: 2.2.0
striptags:
specifier: ^3.2.0
version: 3.2.0
devDependencies:
'@types/diacritics':
specifier: ^1.3.3
version: 1.3.3
'@types/node':
specifier: ^22.13.10
version: 22.13.10
'@types/stats-lite':
specifier: ^2.2.2
version: 2.2.2
ts-node:
specifier: ^10.9.2
version: 10.9.2(@types/[email protected])([email protected])
typescript:
specifier: ^5.8.2
version: 5.8.2
packages:
'@cspotcode/[email protected]':
resolution: {integrity: sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==}
engines: {node: '>=12'}
'@jridgewell/[email protected]':
resolution: {integrity: sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==}
engines: {node: '>=6.0.0'}
'@jridgewell/[email protected]':
resolution: {integrity: sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==}
'@jridgewell/[email protected]':
resolution: {integrity: sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==}
'@tsconfig/[email protected]':
resolution: {integrity: sha512-DcRjDCujK/kCk/cUe8Xz8ZSpm8mS3mNNpta+jGCA6USEDfktlNvm1+IuZ9eTcDbNk41BHwpHHeW+N1lKCz4zOw==}
'@tsconfig/[email protected]':
resolution: {integrity: sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==}
'@tsconfig/[email protected]':
resolution: {integrity: sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==}
'@tsconfig/[email protected]':
resolution: {integrity: sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==}
'@types/[email protected]':
resolution: {integrity: sha512-wt0tBItmBsOUVZ8+MCrkBMoVfH/EUZeTXwYSekVVYilZlGDYssREUR+sX72mHvl2IrbdCKgpYARXKh3awD2how==}
'@types/[email protected]':
resolution: {integrity: sha512-I6LPUvlRH+O6VRUqYOcMudhaIdUVWfsjnZavnsraHvpBwaEyMN29ry+0UVJhImYL16xsscu0aske3yA+uPOWfw==}
'@types/[email protected]':
resolution: {integrity: sha512-T+bzT53cbPbE0hMlCNZux1QuH6hQFNHIwRMTQCu3YPG0W7XUfeoULHl+TehJCjaxQx8cz4wlg5oQsOyG9LvZmA==}
[email protected]:
resolution: {integrity: sha512-ueEepnujpqee2o5aIYnvHU6C0A42MNdsIDeqy5BydrkuC5R1ZuUFnm27EeFJGoEHJQgn3uleRvmTXaJgfXbt4g==}
engines: {node: '>=0.4.0'}
[email protected]:
resolution: {integrity: sha512-OvQ/2pUDKmgfCg++xsTX1wGxfTaszcHVcTctW4UJB4hibJx2HXxxO5UmVgyjMa+ZDsiaf5wWLXYpRWMmBI0QHg==}
engines: {node: '>=0.4.0'}
hasBin: true
[email protected]:
resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==}
[email protected]:
resolution: {integrity: sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==}
[email protected]:
resolution: {integrity: sha512-wlwEkqcsaxvPJML+rDh/2iS824jbREk6DUMUKkEaSlxdYHeS43cClJtsWglvw2RfeXGm6ohKDqsXteJ5sP5enA==}
[email protected]:
resolution: {integrity: sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==}
engines: {node: '>=0.3.1'}
[email protected]:
resolution: {integrity: sha512-pYxfDYpued//QpnLIm4Avk7rsNtAtQkUES2cwAYSvD/wd2pKD71gN2Ebj3e7klzXwjocvE8c5vx/1fxwpqmSxA==}
engines: {node: '>=4'}
[email protected]:
resolution: {integrity: sha512-JLiSz/zsZcGFXPrB4I/AGBvtStkt+8QmksyZBZnVXnnK9XdTEyz0tX8CRYljtwYDuIuZzih6DpHQdi+3Q6zHPw==}
[email protected]:
resolution: {integrity: sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==}
[email protected]:
resolution: {integrity: sha512-U+5l2KrcMNOUPYvazA3h5ekF80FHTUG+87SEAmHZmolh1M+i/WyTCxVzmi+tidIa1tM4BSe8g2Y/D3loWDjj+w==}
engines: {node: '>=4'}
[email protected]:
resolution: {integrity: sha512-M0b3YWQs7R3Z917WRQy1HHA7Ba7D8hvZg6UE5mLykJxQVE2ju0IXbGlaHPPlkY+WN7wFP+wUMXmBFA0aV6vYGQ==}
engines: {node: '>=4'}
[email protected]:
resolution: {integrity: sha512-iT8kIEFcGfUwo53VUFckm+glTkc0oLycRe+YqU/W4wQuIHGIWc5KMIpDnJVdavKCyEZKQTi8IDq27rDmB09QjA==}
[email protected]:
resolution: {integrity: sha512-/Kz55rgUIv2KP2MKphwYT/NCuSfAlbbMRv2ZWw7wyXayu230zdtzhxxuXXcvsc6EmmhS8bSJl3uS1wmMHFumbA==}
engines: {node: '>=2.0.0'}
[email protected]:
resolution: {integrity: sha512-g45ZOGzHDMe2bdYMdIvdAfCQkCTDMGBazSw1ypMowwGIee7ZQ5dU0rBJ8Jqgl+jAKIv4dbeE1jscZq9wid1Tkw==}
[email protected]:
resolution: {integrity: sha512-f0FFpIdcHgn8zcPSbf1dRevwt047YMnaiJM3u2w2RewrB+fob/zePZcrOyQoLMMO7aBIddLcQIEK5dYjkLnGrQ==}
hasBin: true
peerDependencies:
'@swc/core': '>=1.2.50'
'@swc/wasm': '>=1.2.50'
'@types/node': '*'
typescript: '>=2.7'
peerDependenciesMeta:
'@swc/core':
optional: true
'@swc/wasm':
optional: true
[email protected]:
resolution: {integrity: sha512-aJn6wq13/afZp/jT9QZmwEjDqqvSGp1VT5GVg+f/t6/oVyrgXM6BY1h9BRh/O5p3PlUPAe+WuiEZOmb/49RqoQ==}
engines: {node: '>=14.17'}
hasBin: true
[email protected]:
resolution: {integrity: sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg==}
[email protected]:
resolution: {integrity: sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==}
[email protected]:
resolution: {integrity: sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==}
engines: {node: '>=6'}
snapshots:
'@cspotcode/[email protected]':
dependencies:
'@jridgewell/trace-mapping': 0.3.9
'@jridgewell/[email protected]': {}
'@jridgewell/[email protected]': {}
'@jridgewell/[email protected]':
dependencies:
'@jridgewell/resolve-uri': 3.1.2
'@jridgewell/sourcemap-codec': 1.5.0
'@tsconfig/[email protected]': {}
'@tsconfig/[email protected]': {}
'@tsconfig/[email protected]': {}
'@tsconfig/[email protected]': {}
'@types/[email protected]': {}
'@types/[email protected]':
dependencies:
undici-types: 6.20.0
'@types/[email protected]': {}
[email protected]:
dependencies:
acorn: 8.14.1
[email protected]: {}
[email protected]: {}
[email protected]: {}
[email protected]: {}
[email protected]: {}
[email protected]: {}
[email protected]: {}
[email protected]: {}
[email protected]:
dependencies:
drange: 1.1.1
ret: 0.2.2
[email protected]: {}
[email protected]:
dependencies:
randexp: 0.5.3
[email protected]:
dependencies:
isnumber: 1.0.0
[email protected]: {}
[email protected](@types/[email protected])([email protected]):
dependencies:
'@cspotcode/source-map-support': 0.8.1
'@tsconfig/node10': 1.0.11
'@tsconfig/node12': 1.0.11
'@tsconfig/node14': 1.0.3
'@tsconfig/node16': 1.0.4
'@types/node': 22.13.10
acorn: 8.14.1
acorn-walk: 8.3.4
arg: 4.1.3
create-require: 1.1.1
diff: 4.0.2
make-error: 1.3.6
typescript: 5.8.2
v8-compile-cache-lib: 3.0.1
yn: 3.1.1
[email protected]: {}
[email protected]: {}
[email protected]: {}
[email protected]: {}

54
src/cleantext.ts

@ -0,0 +1,54 @@
// cleantext.ts
// Copyright (C) DTP Technologies, LLC
// License: Apache-2.0
import { createRequire } from "module";
const require = createRequire(import.meta.url); // jshint ignore:line
import WebTextFilter from "./lib/edit-with-vi.js";
import { clean } from "./unzalgo.js";
import striptags from "striptags";
import diacritics from "diacritics";
const shoetest = require("shoetest");
/**
 * Light-weight cleaning pass: removes Zalgo combining marks, trims the
 * result, and strips markup tags.
 * @param text The text to be cleaned
 * @returns The cleaned text
 */
export function cleanText(text: string): string {
  const unzalgoed = clean(text);
  return striptags(unzalgoed.trim());
}
/**
 * The heavy hammer of text filtering that removes all malicious and annoying
 * things I know about as of this writing. Zalgo, tags, shoetest, diacritics,
 * and our own custom nonsense UTF-8 and Unicode filters.
 *
 * This filter is very heavy-handed and merciless.
 *
 * Falsy or non-string input is returned unchanged.
 *
 * @param text The text to be filtered
 * @returns The filtered text
 */
export function filterText(text: string): string {
  if (typeof text !== "string" || text.length === 0) {
    return text;
  }

  // Custom filters first: control/invisible characters, the known
  // boilerplate banner, then anything that looks like a complete tag.
  text = WebTextFilter.filterNonsense(text);
  text = WebTextFilter.filterGuff(text);
  text = WebTextFilter.filterHtml(text);

  // Third-party normalisation: look-alike simplification, then diacritics.
  text = shoetest.simplify(text);
  text = diacritics.remove(text);

  /*
   * Once all the stupidity has been stripped, finish with cleanText so the
   * Zalgo pass is followed by a trim and a striptags pass over whatever
   * markup might remain. (Calling clean() alone never stripped tags.)
   */
  return cleanText(text);
}

77
src/lib/edit-with-vi.ts

@ -0,0 +1,77 @@
// edit-with-vi.ts
// Copyright (C) 2025 DTP Technologies, LLC
// All Rights Reserved
// Based on:
// https://github.com/voidful/text-filtering-js/blob/master/text_filtering.js
// - Does not extend String because stop it.
// - ES module (upstream is a CommonJS module)
'use strict';
/*
* This file must only be edited with vi/vim. If you so much as *open* this file
* in VSCode, you've probably damaged the file. Do not save it. Just close it,
* and go edit the file with vi or vim.
*
* VS Code, being web-based, contains logic to filter out the content used to
* implement the filter. You will erase that content, and then various attackers
* will own your chat.
*
* If attackers have owned your chat, you may want to revert or otherwise restore
* this file to its original state.
*/
/**
 * Removes BBcode-style tags (anything between `[` and the next `]` on the
 * same line) while keeping the text between tags.
 *
 * The match is lazy so that `[b]bold[/b]` becomes `bold`; a greedy match
 * would delete everything from the first `[` to the last `]` on the line.
 *
 * @param text The text to filter
 * @returns The text with bracketed tags removed
 */
export function filterBBcode (text: string) : string {
  return text.replace(/\[.*?\]/g, '');
}
/**
 * Replaces every line break (CRLF, LF or CR) with a single space.
 *
 * @param text The text to filter
 * @returns The text flattened onto one line
 */
export function filterLineBreak (text: string) : string {
  const lineBreaks = /(\r\n|\n|\r)/gm;
  const flattened = text.replace(lineBreaks, " ");
  return flattened;
}
/**
 * Removes smiley short-codes (`:name:`, `:name`) and `&#...;` numeric
 * entity runs by applying a fixed list of patterns in order.
 *
 * @param text The text to filter
 * @returns The text with every pattern match removed
 */
export function filterSmileysCode (text: string) : string {
  // Order matters: each pattern runs on the output of the previous one.
  const patterns = [/:\$?.*:\$?/g, /:\w+:?/g, /:\w+/g, /&#.*;/g];
  let result = text;
  for (const pattern of patterns) {
    result = result.replace(pattern, '');
  }
  return result;
}
/**
 * Removes every occurrence of the known boilerplate banner from the text.
 *
 * `String.prototype.replace` with a string pattern only removes the first
 * occurrence, so the banner is removed with split/join instead.
 *
 * @param text The text to filter
 * @returns The text with all banner occurrences removed
 */
export function filterGuff (text: string) : string {
  return text.split('*** 作者被禁止或刪除 內容自動屏蔽 ***').join('');
}
/**
 * Replaces every `<...>` tag-shaped run with a single space.
 *
 * @param text The text to filter
 * @returns The text with tags replaced by spaces
 */
export function filterHtml (text: string) : string {
  const tagPattern = /(<[^>]*>)/g;
  const spacer = ' ';
  return text.replace(tagPattern, spacer);
}
/**
 * Strips control and invisible characters and normalises en dashes.
 *
 * Removed: U+0000-U+0009, U+000B, U+000C, U+000E-U+001F, U+007F (DEL),
 * U+00AD (soft hyphen), U+200B-U+200D (zero-width characters), U+3000
 * (ideographic space) and U+FEFF (BOM). LF (U+000A) and CR (U+000D) are
 * deliberately kept. Every en dash (U+2013) becomes an ASCII hyphen.
 *
 * @param text The text to filter
 * @returns The filtered text
 */
export function filterNonsense (text: string) : string {
  // Upstream class was \u0000-\u001F; it was edited to allow CR and LF. The
  // range must resume at \u000e and run through \u001f, otherwise control
  // characters such as ESC (U+001B) slip through.
  text = text.replace(/[\u0000-\u0009\u000b\u000c\u000e-\u001f\u007f\u00AD\u200B-\u200D\u3000\uFEFF]/g, '');
  // Soft hyphens are already removed by the class above, so the former
  // follow-up replacement of U+00AD with a space could never match.
  // NOTE(review): if a space was intended, drop \u00AD from the class instead.
  text = text.replace(/\u2013/g, '-');
  return text;
}
/**
 * Runs the smiley, BBcode, guff, HTML and line-break filters in sequence.
 *
 * The filters are called directly: this file is an ES module, where
 * `module.exports` does not exist and referencing it throws at runtime.
 *
 * @param text The text to filter
 * @returns The filtered text
 */
export function filterAll (text: string) : string {
  text = filterSmileysCode(text);
  text = filterBBcode(text);
  text = filterGuff(text);
  text = filterHtml(text);
  text = filterLineBreak(text);
  return text;
}
// Default export: every filter gathered on one object for callers that prefer
// `WebTextFilter.filterX(...)` over named imports.
const WebTextFilter = {
  filterBBcode,
  filterLineBreak,
  filterSmileysCode,
  filterGuff,
  filterHtml,
  filterNonsense,
  filterAll,
};
export default WebTextFilter;

138
src/unzalgo.ts

@ -0,0 +1,138 @@
// unzalgo.ts
// Copyright (C) DTP Technologies, LLC
// License: Apache-2.0
"use strict";
import { percentile } from "stats-lite";
/** Matches runs of non-spacing (Mn) and enclosing (Me) combining marks. */
const categories = /[\p{Mn}\p{Me}]+/u;
const DEFAULT_DETECTION_THRESHOLD = 0.55;
const DEFAULT_TARGET_DENSITY = 0;

/** Canonical composition (NFC). */
function compose(text: string): string {
  return text.normalize("NFC");
}

/** Canonical decomposition (NFD), which splits base characters from marks. */
function decompose(text: string): string {
  return text.normalize("NFD");
}

/** Number of combining marks divided by the string length (at least 1). */
function computeZalgoDensity(text: string): number {
  let marks = 0;
  for (const character of text) {
    if (categories.test(character)) {
      marks += 1;
    }
  }
  return marks / Math.max(text.length, 1);
}

/** Restricts x to the closed interval [0, 1]. */
function clamp(x: number): number {
  return Math.max(0, Math.min(1, x));
}
/**
 * Computes a score [0, 1] for every word in the input string. Each score
 * represents the ratio of combining characters to total characters in a word.
 *
 * Blank input (empty or whitespace only) yields a single score of 0.
 *
 * @param text The input string for which to compute scores.
 * @returns An array of scores where each score describes the Zalgo ratio of
 * a word.
 */
export function computeScores(text: string): Array<number> {
  /**
   * The decomposed text is trimmed BEFORE splitting. Splitting untrimmed
   * text produces an empty first/last word whenever the input has leading
   * or trailing whitespace, and scoring that word divides 0 by 0 (NaN).
   */
  const trimmed = decompose(text).trim();
  if (!trimmed.length) {
    return [0];
  }
  return trimmed.split(/\s+/).map((word) => {
    let banned = 0;
    for (const character of word) {
      if (categories.test(character)) {
        ++banned;
      }
    }
    return banned / word.length;
  });
}
/**
 * Decides whether a string should be treated as Zalgo text.
 *
 * A single combining character is not enough: the per-word scores are
 * reduced to their 0.75 percentile and that value is compared with the
 * (clamped) threshold, so ordinary internationalized text is not flagged
 * automatically.
 *
 * @param text A string for which a Zalgo text check is run.
 * @param detectionThreshold A threshold [0, 1]. The higher the threshold,
 * the more combining characters are needed for it to be detected as Zalgo
 * text.
 * @returns Whether the string is a Zalgo text string.
 */
export function isZalgo(
  text: string,
  detectionThreshold: number = DEFAULT_DETECTION_THRESHOLD
): boolean {
  const threshold = clamp(detectionThreshold);
  const upperQuartileScore = percentile(computeScores(text), 0.75);
  return upperQuartileScore >= threshold;
}
/**
 * Removes all combining characters for every word in a string if the word is
 * classified as Zalgo text.
 *
 * If `targetDensity` is specified, not all the Zalgo characters will be
 * removed. Instead, they will be thinned out uniformly.
 *
 * @param text
 * A string for which combining characters are removed for every word whose
 * Zalgo property is met.
 * @param options Options for cleaning.
 * @param options.detectionThreshold
 * A threshold [0, 1]. The higher the threshold, the more combining
 * characters are needed for it to be detected as Zalgo text.
 * @param options.targetDensity
 * A threshold [0, 1]. The higher the density, the more Zalgo characters
 * will be part of the resulting string.
 * @returns A cleaned, more readable string.
 */
export function clean(
  text: string,
  {
    detectionThreshold = DEFAULT_DETECTION_THRESHOLD,
    targetDensity = DEFAULT_TARGET_DENSITY,
  } = {}
): string {
  const effectiveTargetDensity = clamp(targetDensity);
  let cleaned = "";
  // The capturing group keeps the whitespace separators in the output.
  for (const word of decompose(text).split(/(\s+)/)) {
    if (!isZalgo(word, detectionThreshold)) {
      cleaned += word;
      continue;
    }
    // Work on code points throughout. Indexing the UTF-16 string with a
    // code-point index misaligns after any astral character (e.g. emoji)
    // and lets combining marks slip through.
    const characters = [...word];
    let cleanedWord = "";
    for (const [i, character] of characters.entries()) {
      if (categories.test(character)) {
        const remainder = characters.slice(i + 1).join("");
        // Compare the density reached by keeping vs. dropping this mark and
        // drop it when that lands at least as close to the target.
        const admissionDistance =
          effectiveTargetDensity -
          computeZalgoDensity(cleanedWord + character + remainder);
        const omissionDistance =
          effectiveTargetDensity - computeZalgoDensity(cleanedWord + remainder);
        if (Math.abs(omissionDistance) <= Math.abs(admissionDistance)) {
          continue;
        }
      }
      cleanedWord += character;
    }
    cleaned += cleanedWord;
  }
  return compose(cleaned);
}

37
tsconfig.json

@ -0,0 +1,37 @@
{
"compilerOptions": {
"target": "ES2022",
"lib": ["es2022", "dom"],
"experimentalDecorators": true,
"emitDecoratorMetadata": true,
"module": "ESNext",
"rootDir": "./src",
"moduleResolution": "node",
"baseUrl": "./src",
"typeRoots": ["node_modules/@types", "types"],
"allowImportingTsExtensions": true,
"rewriteRelativeImportExtensions": true,
"resolveJsonModule": true,
"declaration": false,
"sourceMap": true,
"outDir": "dist",
"removeComments": false,
"importHelpers": true,
"sourceRoot": "./src",
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
"strict": true,
"alwaysStrict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noImplicitReturns": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedIndexedAccess": true,
"pretty": true,
"skipDefaultLibCheck": true,
"skipLibCheck": true
},
"include": ["src/**/*.ts"],
"exclude": ["node_modules", "docs"],
"files": ["types/dtp-cleantext.d.ts"]
}

4
types/dtp-cleantext.d.ts

@ -0,0 +1,4 @@
/**
 * Public type surface of the package: the two functions re-exported by
 * index.js from dist/cleantext.js.
 */
declare module "dtp-cleantext" {
/** Removes Zalgo combining marks, trims, and strips markup tags. */
export function cleanText(text: string): string;
/**
 * Heavy-handed filter: custom nonsense/guff/HTML filters, shoetest
 * simplification and diacritic removal, followed by a final cleaning pass.
 * Falsy or non-string input is returned unchanged.
 */
export function filterText(text: string): string;
}
Loading…
Cancel
Save