DTP Base provides a scalable and secure Node.js application development harness ready for production service.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

197 lines
6.0 KiB

// text.js
// Copyright (C) 2024 DTP Technologies, LLC
// All Rights Reserved
'use strict';
import mongoose from 'mongoose';
const User = mongoose.model('User');
const Link = mongoose.model('Link');
import striptags from 'striptags';
import unzalgo from 'unzalgo';
import shoetest from 'shoetest';
import diacritics from 'diacritics';
import DtpTextFilter from './lib/edit-with-vi.js';
import { SiteService, SiteError } from '../../lib/site-lib.js';
/**
 * Text utilities for the site: cleaning and filtering user-supplied text, and
 * extracting @mentions, #hashtags, and links/URLs from post content.
 */
export default class TextService extends SiteService {

  static get slug () { return 'text'; }
  static get name ( ) { return 'TextService'; }

  /**
   * @param {Object} dtp The platform object handed to every service; this
   * class reads `dtp.services` and `dtp.config` from it.
   */
  constructor (dtp) {
    super(dtp, TextService);
  }

  /**
   * Resolves the 'links' job queue used by findLinks to schedule link ingest
   * jobs.
   */
  async start ( ) {
    const { jobQueue: jobQueueService } = this.dtp.services;
    this.linksQueue = jobQueueService.getJobQueue('links', this.dtp.config.jobQueues.links);
  }

  /**
   * Basic text cleaning function to remove Zalgo and tags.
   * @param {String} text The text to be cleaned
   * @returns The cleaned (and trimmed) text. Non-string input is returned
   * unchanged, matching the behavior of filter().
   */
  clean (text) {
    if (typeof text !== 'string') {
      return text;
    }
    const unzalgoed = unzalgo.clean(text);
    return striptags(unzalgoed.trim());
  }

  /**
   * The heavy hammer of text filtering that removes all malicious and annoying
   * things known as of this writing: Zalgo, tags, shoetest, diacritics, and
   * the custom nonsense UTF-8 and Unicode filters in DtpTextFilter.
   *
   * This filter is very heavy-handed and merciless.
   *
   * @param {String} text The text to be filtered
   * @returns The filtered text. Empty or non-string input is returned as-is.
   */
  filter (text) {
    if (!text || (typeof text !== 'string')) {
      return text;
    }

    text = DtpTextFilter.filterNonsense(text);
    text = DtpTextFilter.filterGuff(text);
    text = DtpTextFilter.filterHtml(text);

    text = shoetest.simplify(text);
    text = diacritics.remove(text);

    /*
     * Once all the stupidity has been stripped, strip the HTML
     * tags that might remain.
     */
    return this.clean(text);
  }

  /**
   * Scans input text for username mentions (`@username`) and resolves those
   * names to User records.
   * @param {String} content The text content to be scanned for mentions
   * @returns Array of lean User documents (only `_id` selected) for the valid
   * username(s) mentioned, or an empty array when there are none.
   */
  async findMentions (content) {
    if (typeof content !== 'string') {
      return [ ];
    }

    const matches = content.match(/\B@[a-z0-9_-]+/gi);
    if (!Array.isArray(matches) || (matches.length === 0)) {
      return [ ];
    }

    /*
     * Remove @, lowercase, and remove duplicates. A Set keeps first-seen order.
     */
    const usernames = [...new Set(
      matches.map((username) => username.trim().slice(1).toLowerCase())
    )];
    this.log.debug('findMentions found usernames', { usernames });

    const mentions = await User
      .find({ username_lc: { $in: usernames } })
      .select('_id')
      .lean();
    return mentions;
  }

  /**
   * Scans input text for hashtags (`#tag`).
   * @param {String} content The text content to be scanned for hashtags
   * @returns Array of lowercased tag names without the leading `#`, in order
   * of appearance. Duplicates are NOT removed.
   */
  findHashtags (content) {
    if (typeof content !== 'string') {
      return [ ];
    }

    const matches = content.match(/\B\#[a-z0-9_-]+/gi);
    if (!Array.isArray(matches) || (matches.length === 0)) {
      return [ ];
    }

    const tags = matches.map((tag) => tag.trim().slice(1).toLowerCase());
    this.log.debug('hashtags extracted', { tags });
    return tags;
  }

  /**
   * Scans input text for links/URLs, performs some checks, and schedules them
   * for ingest using a worker. The worker will emit socket.io messages to
   * populate the UI with resolved link previews.
   *
   * Uses https://github.com/StevenBlack/hosts/tree/master/alternates/porn to
   * eliminate blocked domains, which are stored in Redis (the lookup itself is
   * delegated to the link service's isDomainBlocked).
   *
   * @param {User} author the author of the status being scanned
   * @param {*} content the content of the status being scanned
   * @param {*} options passed through unchanged in the 'link-ingest' job data
   * @returns array of Link IDs for the links detected, or an empty array
   * @throws {SiteError} 403 when the content contains a link and the author
   * may not share links, or when a link points to a prohibited domain.
   */
  async findLinks (author, content, options) {
    if (typeof content !== 'string') {
      return [ ];
    }

    const urlRegex = /(((https?:\/\/)|(www\.))[^\s]+)/g;
    const matches = content.match(urlRegex);
    if (!Array.isArray(matches) || (matches.length === 0)) {
      this.log.debug('post content contains no URLs/links');
      return [ ];
    }

    /*
     * Only enforce the permission once a link is actually present, so authors
     * without it can still publish link-free content.
     */
    if (!author.permissions.canShareLinks) {
      throw new SiteError(403, 'You are not permitted to share links in your posts.');
    }

    const NOW = new Date();
    const { link: linkService } = this.dtp.services;

    const links = [ ];
    for (const match of matches) {
      /*
       * The pattern also accepts scheme-less "www." links, which the URL
       * parser rejects. Give those a scheme so they can be parsed and stored
       * as an absolute URL.
       */
      const url = /^https?:\/\//.test(match) ? match : `http://${match}`;

      let domain;
      try {
        /*
         * Drop the root-label dot ("example.cn.") so the suffix checks and
         * the block list lookup can't be sidestepped by a trailing period.
         */
        domain = new URL(url).hostname.toLowerCase().replace(/\.$/, '');
      } catch (error) {
        this.log.debug('skipping unparseable URL in post content', { url, error: error.message });
        continue;
      }

      if (domain.endsWith('.cn')) {
        throw new SiteError(403, 'Linking to Chinese websites is prohibited.');
      }
      if (domain.endsWith('.il')) {
        throw new SiteError(403, 'Linking to websites in Israel is prohibited.');
      }
      if (await linkService.isDomainBlocked(domain)) {
        this.log.alert('detected blocked domain in shared link', {
          author: { _id: author._id, username: author.username },
          domain, url,
        });
        throw new SiteError(403, `All links/URLs pointing to ${domain} are prohibited.`);
      }

      /*
       * An upsert is used to create a document if one doesn't exist. The domain
       * and url are set on insert, and lastShared is always set so it will be
       * current.
       *
       * submittedBy is an array that holds the User._id of each member that
       * submitted the link. This enables their Link History view, which becomes
       * its own feed.
       */
      const link = await Link.findOneAndUpdate(
        { domain, url },
        {
          $setOnInsert: {
            created: NOW,
            domain, url,
          },
          $addToSet: { submittedBy: author._id },
          $set: { lastShared: NOW },
        },
        { upsert: true, new: true },
      );

      /*
       * link is now the document from MongoDB. An ingest job is queued for
       * every detected link; the add() call is awaited so that, if it returns
       * a promise, a queueing failure surfaces here instead of becoming an
       * unhandled rejection.
       */
      await this.linksQueue.add('link-ingest', {
        submitterId: author._id,
        linkId: link._id,
        options,
      });

      this.log.debug('adding detected link', { domain, url, link: link._id });
      links.push(link._id);
    }

    return links;
  }
}