DTP Base provides a scalable and secure Node.js application development harness ready for production service.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

119 lines
4.4 KiB

// site-unzalgo.js
// Copyright (C) 2024 DTP Technologies, LLC
// All Rights Reserved
'use strict';
import { percentile } from "stats-lite";
// Matches runs of Unicode combining marks: nonspacing (Mn) and enclosing (Me).
const categories = /[\p{Mn}\p{Me}]+/u;

// Default score at or above which text is treated as Zalgo.
const DEFAULT_DETECTION_THRESHOLD = 0.55;

// Default density of combining marks left behind after cleaning.
const DEFAULT_TARGET_DENSITY = 0;

/**
 * Normalizes a string to Unicode NFC (canonical composition).
 * @param {string} string
 * @return {string}
 */
const compose = (string) => {
  return string.normalize("NFC");
};

/**
 * Normalizes a string to Unicode NFD (canonical decomposition).
 * @param {string} string
 * @return {string}
 */
const decompose = (string) => {
  return string.normalize("NFD");
};

/**
 * Ratio of combining marks to the string's `length`.
 * Marks are counted per code point, while the denominator is the UTF-16
 * length, floored at 1 so that an empty string yields 0 instead of NaN.
 * @param {string} string
 * @return {number}
 */
const computeZalgoDensity = (string) => {
  let markCount = 0;
  for (const character of string) {
    if (categories.test(character)) {
      markCount += 1;
    }
  }
  const divisor = Math.max(string.length, 1);
  return markCount / divisor;
};

/**
 * Restricts a number to the closed interval [0, 1].
 * @param {number} x
 * @return {number}
 */
const clamp = (x) => {
  const capped = Math.min(x, 1);
  return Math.max(capped, 0);
};
/**
 * Static helpers for detecting and cleaning "Zalgo" text, i.e. text that is
 * overloaded with Unicode combining marks.
 */
export class SiteUnzalgo {
  /**
   * Computes a score ∈ [0, 1] for every whitespace-separated word in the
   * input string. Each score is the number of combining marks in the
   * NFD-decomposed word divided by that word's `length` (UTF-16 code units).
   * @param {string} string
   * The input string for which to compute scores.
   * @return {number[]}
   * One score per word. A blank or whitespace-only input yields `[0]`.
   */
  static computeScores (string) {
    // Trim BEFORE splitting: otherwise leading/trailing whitespace produces
    // empty tokens whose zero length turns the score into NaN (0 / 0).
    const trimmed = decompose(string).trim();
    if (!trimmed.length) {
      return [0];
    }
    return trimmed.split(/\s+/).map((word) => {
      let banned = 0;
      for (const character of word) {
        if (categories.test(character)) {
          ++banned;
        }
      }
      return banned / word.length;
    });
  }

  /**
   * Determines if the string consists of Zalgo text. The mere occurrence of
   * a combining character is not enough to trigger the detection: the 75th
   * percentile of the per-word scores must reach the threshold, so ordinary
   * internationalized strings aren't automatically classified as Zalgo text.
   * @param {string} string
   * A string for which a Zalgo text check is run.
   * @param {number} detectionThreshold
   * A threshold that is clamped to [0, 1]. The higher the threshold, the more
   * combining characters are needed for it to be detected as Zalgo text.
   * @return {boolean}
   * Whether the string is a Zalgo text string.
   */
  static isZalgo (string, detectionThreshold = DEFAULT_DETECTION_THRESHOLD) {
    const wordScores = SiteUnzalgo.computeScores(string);
    const totalScore = percentile(wordScores, 0.75);
    return totalScore >= clamp(detectionThreshold);
  }

  /**
   * Removes combining characters from every word that is classified as Zalgo
   * text; other words and all whitespace runs are passed through. The input
   * is NFD-decomposed first and the result is NFC-composed.
   * If `targetDensity` is above 0, not all combining characters are removed:
   * walking the word left to right, a mark is dropped whenever dropping it
   * leaves the projected density at least as close to the target as keeping it.
   * @param {string} string
   * The string to clean.
   * @param {object} options
   * Options for cleaning.
   * @param {number} [options.detectionThreshold=DEFAULT_DETECTION_THRESHOLD]
   * A threshold that is clamped to [0, 1]. The higher the threshold, the more
   * combining characters are needed for a word to be detected as Zalgo text.
   * @param {number} [options.targetDensity=DEFAULT_TARGET_DENSITY]
   * Desired density of combining characters, clamped to [0, 1]. The higher
   * the density, the more combining characters remain in the result.
   * @return {string}
   * A cleaned, more readable string.
   */
  static clean (
    string,
    {
      detectionThreshold = DEFAULT_DETECTION_THRESHOLD,
      targetDensity = DEFAULT_TARGET_DENSITY
    } = { },
  ) {
    const effectiveTargetDensity = clamp(targetDensity);
    let cleaned = "";
    // The capture group keeps the whitespace runs as tokens so that the
    // original spacing is reproduced in the output.
    for (const word of decompose(string).split(/(\s+)/)) {
      if (!SiteUnzalgo.isZalgo(word, detectionThreshold)) {
        cleaned += word;
        continue;
      }
      // Work on code points. Indexing the UTF-16 string with a code-point
      // index (as `word.substr(i)` would) goes out of step as soon as the
      // word contains an astral character such as an emoji.
      const characters = [...word];
      let cleanedWord = "";
      for (let i = 0; i < characters.length; ++i) {
        const character = characters[i];
        if (categories.test(character)) {
          const remainder = characters.slice(i + 1).join("");
          const admissionProjection = cleanedWord + character + remainder;
          const omissionProjection = cleanedWord + remainder;
          const admissionDistance = effectiveTargetDensity - computeZalgoDensity(admissionProjection);
          const omissionDistance = effectiveTargetDensity - computeZalgoDensity(omissionProjection);
          if (Math.abs(omissionDistance) <= Math.abs(admissionDistance)) {
            // Dropping this mark is at least as close to the target density.
            continue;
          }
        }
        cleanedWord += character;
      }
      cleaned += cleanedWord;
    }
    return compose(cleaned);
  }
}