You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
117 lines
4.7 KiB
117 lines
4.7 KiB
// unzalgo.ts
// Copyright (C) DTP Technologies, LLC
// License: Apache-2.0
"use strict";
|
|
import { percentile } from "stats-lite";
|
|
// Matches combining marks: Unicode general categories Mn (nonspacing mark)
// and Me (enclosing mark). Callers test one code point at a time.
const categories = /[\p{Mn}\p{Me}]+/u;

// Default for the `detectionThreshold` parameter of `isZalgo` and `clean`.
const DEFAULT_DETECTION_THRESHOLD = 0.55;

// Default for the `targetDensity` option of `clean`.
const DEFAULT_TARGET_DENSITY = 0;

/** Returns `text` in Unicode Normalization Form C (canonical composition). */
const compose = (text) => text.normalize("NFC");

/** Returns `text` in Unicode Normalization Form D (canonical decomposition). */
const decompose = (text) => text.normalize("NFD");

/**
 * Computes the share of combining marks among the code points of `text`.
 *
 * Both the numerator and the denominator are counted in code points, so a
 * character outside the Basic Multilingual Plane (two UTF-16 code units)
 * counts as one character instead of two.
 *
 * @param text string The string to measure.
 * @return number A ratio ∈ [0, 1]; `0` for the empty string.
 */
const computeZalgoDensity = (text) => {
    const characters = [...text];
    const combiningCount = characters.filter((character) => categories.test(character)).length;
    // `Math.max(…, 1)` avoids dividing by zero for the empty string.
    return combiningCount / Math.max(characters.length, 1);
};

/** Restricts `x` to the closed interval [0, 1]. */
const clamp = (x) => Math.max(Math.min(x, 1), 0);
/**
 * Computes a score ∈ [0, 1] for every whitespace-separated word in the input
 * string. Each score is the ratio of combining characters to total characters
 * (both counted in code points) in the NFD-decomposed word.
 *
 * @param text string The input string for which to compute scores.
 * @return Array<number> An array of scores where each score describes the
 * Zalgo ratio of a word. Empty or whitespace-only input yields `[0]`.
 */
export function computeScores(text) {
    /**
     * Trim the very string that gets split: leading or trailing whitespace
     * would otherwise produce an empty word and a `0 / 0` (NaN) score.
     */
    const trimmed = decompose(text).trim();
    if (trimmed.length === 0) {
        return [0];
    }
    const wordScores = [];
    for (const word of trimmed.split(/\s+/)) {
        let total = 0;
        let banned = 0;
        // Iterating the string walks code points, so `total` and `banned`
        // share the same unit even for characters outside the BMP.
        for (const character of word) {
            ++total;
            if (categories.test(character)) {
                ++banned;
            }
        }
        wordScores.push(banned / total);
    }
    return wordScores;
}
/**
 * Determines if the string consists of Zalgo text. The mere occurrence of a
 * combining character is not enough to trigger the detection: the per-word
 * scores from `computeScores` are reduced with `percentile(wordScores, 0.75)`
 * and that value is compared against the threshold. Thus, internationalized
 * strings aren't automatically classified as Zalgo text.
 *
 * @param text string A string for which a Zalgo text check is run.
 * @param detectionThreshold number A threshold that is clamped to [0, 1]. The
 * higher the threshold, the more combining characters are needed for the
 * string to be detected as Zalgo text.
 * @return boolean Whether the string is a Zalgo text string.
 */
export function isZalgo(text, detectionThreshold = DEFAULT_DETECTION_THRESHOLD) {
    const wordScores = computeScores(text);
    const totalScore = percentile(wordScores, 0.75);
    // Out-of-range thresholds are pulled back into [0, 1] before comparing.
    return totalScore >= clamp(detectionThreshold);
}
/**
 * Removes combining characters from every word in a string that is classified
 * as Zalgo text by `isZalgo`. Words that are not classified as Zalgo text and
 * all whitespace are passed through unchanged.
 *
 * If `targetDensity` is specified, not all the Zalgo characters will be
 * removed. Instead, each combining character is kept only when keeping it
 * leaves the projected density of the word strictly closer to the target than
 * dropping it would.
 *
 * @param text string
 * A string for which combining characters are removed for every word whose
 * Zalgo property is met.
 * @param options object Options for cleaning.
 * @param options.detectionThreshold number
 * A threshold ∈ [0, 1]. The higher the threshold, the more combining
 * characters are needed for it to be detected as Zalgo text.
 * @param options.targetDensity number
 * A density that is clamped to [0, 1]. The higher the density, the more Zalgo
 * characters will be part of the resulting string.
 * @return string
 * A cleaned, more readable string in NFC form.
 */
export function clean(text, { detectionThreshold = DEFAULT_DETECTION_THRESHOLD, targetDensity = DEFAULT_TARGET_DENSITY, } = {}) {
    const effectiveTargetDensity = clamp(targetDensity);
    let cleaned = "";
    // The capturing group keeps the whitespace runs as tokens of their own,
    // so the original spacing is reproduced in the output.
    for (const word of decompose(text).split(/(\s+)/)) {
        if (!isZalgo(word, detectionThreshold)) {
            cleaned += word;
            continue;
        }
        // Work on code points throughout. Mixing code-point indices with
        // UTF-16 offsets (`substring`) misaligns the projections as soon as
        // the word contains a character outside the BMP.
        const characters = [...word];
        let cleanedWord = "";
        for (let i = 0; i < characters.length; ++i) {
            const character = characters[i];
            if (categories.test(character)) {
                const remainder = characters.slice(i + 1).join("");
                // Word as it would look with / without this combining mark.
                const admissionProjection = cleanedWord + character + remainder;
                const omissionProjection = cleanedWord + remainder;
                const admissionDistance = effectiveTargetDensity - computeZalgoDensity(admissionProjection);
                const omissionDistance = effectiveTargetDensity - computeZalgoDensity(omissionProjection);
                // Ties go to omission, so the mark is dropped.
                if (Math.abs(omissionDistance) <= Math.abs(admissionDistance)) {
                    continue;
                }
            }
            cleanedWord += character;
        }
        cleaned += cleanedWord;
    }
    return compose(cleaned);
}
//# sourceMappingURL=unzalgo.js.map
|