// text.js
// Copyright (C) 2024 DTP Technologies, LLC
// All Rights Reserved

'use strict';

import mongoose from 'mongoose';
const User = mongoose.model('User');
const Link = mongoose.model('Link');

import striptags from 'striptags';
import shoetest from 'shoetest';
import diacritics from 'diacritics';
import DtpTextFilter from './lib/edit-with-vi.js';

import { SiteService, SiteError, SiteUnzalgo, } from '../../lib/site-lib.js';

/**
 * Text processing service: cleans and filters user-supplied text, and extracts
 * @mentions, #hashtags, and links/URLs from post content.
 */
export default class TextService extends SiteService {

  static get slug () { return 'text'; }
  static get name ( ) { return 'TextService'; }

  constructor (dtp) {
    super(dtp, TextService);
  }

  /**
   * Acquires the `links` job queue from the jobQueue service, configured from
   * `dtp.config.jobQueues.links`. findLinks() submits its ingest jobs to this
   * queue, so start() must have run before findLinks() is called.
   */
  async start ( ) {
    const { jobQueue: jobQueueService } = this.dtp.services;
    this.linksQueue = jobQueueService.getJobQueue('links', this.dtp.config.jobQueues.links);
  }

  /**
   * Basic text cleaning function to remove Zalgo and tags.
   *
   * NOTE(review): no input guard here (unlike filter()); behavior for
   * non-string input depends on SiteUnzalgo.clean — confirm before relying on
   * it.
   *
   * @param {String} text The text to be cleaned
   * @returns The cleaned text (Zalgo removed, trimmed, tags stripped)
   */
  clean (text) {
    const unzalgoed = SiteUnzalgo.clean(text);
    return striptags(unzalgoed.trim());
  }

  /**
   * The heavy hammer of text filtering that removes all malicious and annoying
   * things I know about as of this writing. Zalgo, tags, shoetest, diacritics,
   * and our own custom nonsense UTF-8 and Unicode filters.
   *
   * This filter is very heavy-handed and merciless.
   *
   * @param {String} text The text to be filtered
   * @returns The filtered text. Falsy or non-string input is returned
   * unchanged.
   */
  filter (text) {
    // `!text` already covers the empty string, so no separate length check.
    if (!text || (typeof text !== 'string')) {
      return text;
    }

    text = DtpTextFilter.filterNonsense(text);
    text = DtpTextFilter.filterGuff(text);
    text = DtpTextFilter.filterHtml(text);

    text = shoetest.simplify(text);
    text = diacritics.remove(text);

    /*
     * Once all the stupidity has been stripped, strip the HTML
     * tags that might remain.
     */
    return this.clean(text);
  }

  /**
   * Scans input text for username mentions (`@username`) and resolves those
   * names to matching User records.
   *
   * @param {String} content The text content to be scanned for mentions
   * @returns {Promise<Array>} Lean User documents containing only `_id`, one
   * per existing user whose `username_lc` matches a mentioned name. Empty
   * array when content is not a string or mentions nobody.
   */
  async findMentions (content) {
    if (typeof content !== 'string') {
      return [ ];
    }

    const matches = content.match(/\B@[a-z0-9_-]+/gi);
    if (!Array.isArray(matches) || (matches.length === 0)) {
      return [ ];
    }

    /*
     * Remove @, lowercase, and remove duplicates. A Set preserves first-seen
     * order, so the result matches the previous indexOf-based dedupe.
     */
    const usernames = [
      ...new Set(matches.map((mention) => mention.slice(1).toLowerCase())),
    ];
    this.log.debug('findMentions found usernames', { usernames });

    const mentions = await User
      .find({ username_lc: { $in: usernames } })
      .select('_id')
      .lean();

    return mentions;
  }

  /**
   * Scans input text for hashtags (`#tag`).
   *
   * @param {String} content The text content to be scanned for hashtags
   * @returns {Array<String>} Lowercased tag names without the leading `#`, in
   * order of appearance. Duplicates are NOT removed. Empty array when content
   * is not a string or contains no hashtags.
   */
  findHashtags (content) {
    if (typeof content !== 'string') {
      return [ ];
    }

    const matches = content.match(/\B\#[a-z0-9_-]+/gi);
    if (!Array.isArray(matches) || (matches.length === 0)) {
      return [ ];
    }

    const tags = matches.map((tag) => tag.slice(1).toLowerCase());
    this.log.debug('hashtags extracted', { tags });

    return tags;
  }

  /**
   * Scans input text for links/URLs, performs some checks, and schedules them
   * for ingest using a worker. Per the original author, the worker will emit
   * socket.io messages to populate the UI with resolved link previews.
   *
   * Blocked domains are checked through the link service's isDomainBlocked().
   * Per the original author, that list comes from
   * https://github.com/StevenBlack/hosts/tree/master/alternates/porn and is
   * stored in Redis.
   *
   * @param {User} author the author of the status being scanned
   * @param {*} content the content of the status being scanned
   * @param {*} options passed through unchanged to the `link-ingest` job
   * @returns array of Link IDs for the links detected, or an empty array
   * @throws {SiteError} 403 when the author may not share links, or a link
   * points to a prohibited or blocked domain; 400 when a detected link cannot
   * be parsed as a URL.
   */
  async findLinks (author, content, options) {
    if (typeof content !== 'string') {
      return [ ];
    }

    const urlRegex = /(((https?:\/\/)|(www\.))[^\s]+)/g;
    const urls = content.match(urlRegex);
    if (!Array.isArray(urls) || (urls.length === 0)) {
      this.log.debug('post content contains no URLs/links');
      return [ ];
    }

    /*
     * Only enforce the link-sharing permission once we know the content
     * actually contains links; link-free content must not be rejected.
     */
    if (!author.permissions.canShareLinks) {
      throw new SiteError(403, 'You are not permitted to share links in your posts.');
    }

    const NOW = new Date();
    const { link: linkService } = this.dtp.services;

    const links = [ ];
    for (const match of urls) {
      /*
       * The regex also accepts scheme-less "www." links, which the URL
       * constructor rejects. Give those a default scheme so they can be parsed
       * here and fetched later. Links that already carry a scheme are used
       * byte-for-byte so they keep matching existing Link documents.
       */
      const url = match.startsWith('www.') ? `https://${match}` : match;

      let domain;
      try {
        domain = new URL(url).hostname.toLowerCase();
      } catch (error) {
        this.log.debug('failed to parse detected link', { url, error: error.message });
        throw new SiteError(400, 'Your post contains an invalid link/URL.');
      }

      if (domain.endsWith('.cn')) {
        throw new SiteError(403, 'Linking to Chinese websites is prohibited.');
      }
      if (domain.endsWith('.il')) {
        throw new SiteError(403, 'Linking to websites in Israel is prohibited.');
      }
      if (await linkService.isDomainBlocked(domain)) {
        this.log.alert('detected blocked domain in shared link', {
          author: { _id: author._id, username: author.username },
          domain, url,
        });
        throw new SiteError(403, `All links/URLs pointing to ${domain} are prohibited.`);
      }

      /*
       * An upsert is used to create a document if one doesn't exist. The domain
       * and url are set on insert, and lastShared is always set so it will be
       * current.
       *
       * submittedBy is an array that holds the User._id of each member that
       * submitted the link. This enables their Link History view, which becomes
       * its own feed.
       */
      const link = await Link.findOneAndUpdate(
        { domain, url },
        {
          $setOnInsert: {
            created: NOW,
            domain, url,
          },
          $addToSet: { submittedBy: author._id },
          $set: { lastShared: NOW },
        },
        { upsert: true, new: true },
      );

      /*
       * link is now the document from MongoDB and may or may not already
       * contain additional information about the link. A job is queued to
       * fetch link preview data and to scan the link for malicious intent.
       *
       * NOTE(review): the job is queued unconditionally, even when the Link
       * document already existed; presumably the worker decides whether any
       * work is needed — confirm.
       *
       * The add() call is awaited so a failed enqueue surfaces to the caller
       * instead of becoming an unhandled rejection (awaiting is harmless if
       * add() turns out to be synchronous).
       */
      await this.linksQueue.add('link-ingest', {
        submitterId: author._id,
        linkId: link._id,
        options,
      });

      this.log.debug('adding detected link', { domain, url, link: link._id });
      links.push(link._id);
    }

    return links;
  }
}