You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
197 lines
6.0 KiB
197 lines
6.0 KiB
// text.js
|
|
// Copyright (C) 2024 DTP Technologies, LLC
|
|
// All Rights Reserved
|
|
|
|
'use strict';
|
|
|
|
import mongoose from 'mongoose';
|
|
|
|
const User = mongoose.model('User');
|
|
const Link = mongoose.model('Link');
|
|
|
|
import striptags from 'striptags';
|
|
import unzalgo from 'unzalgo';
|
|
import shoetest from 'shoetest';
|
|
import diacritics from 'diacritics';
|
|
import DtpTextFilter from './lib/edit-with-vi.js';
|
|
|
|
import { SiteService, SiteError } from '../../lib/site-lib.js';
|
|
|
|
/**
 * Site service for sanitizing user-supplied text and for extracting
 * mentions, hashtags, and links from it.
 */
export default class TextService extends SiteService {

  /* Static service descriptors; the class itself is handed to SiteService. */
  static get slug () { return 'text'; }
  static get name ( ) { return 'TextService'; }

  /**
   * @param {*} dtp The platform object passed through to SiteService. This
   * class reads `dtp.services` and `dtp.config` from it via `this.dtp`.
   */
  constructor (dtp) {
    super(dtp, TextService);
  }

  /**
   * Acquires the `links` job queue from the job queue service, configured
   * with `config.jobQueues.links`, and keeps it on `this.linksQueue` for use
   * by findLinks().
   */
  async start ( ) {
    const { jobQueue: jobQueueService } = this.dtp.services;
    this.linksQueue = jobQueueService.getJobQueue('links', this.dtp.config.jobQueues.links);
  }

  /**
   * Basic text cleaning function to remove Zalgo and tags.
   *
   * Empty and non-string input is returned unchanged, matching filter().
   *
   * @param {String} text The text to be cleaned
   * @returns The cleaned (and trimmed) text
   */
  clean (text) {
    if (!text || (typeof text !== 'string')) {
      return text;
    }

    const unzalgoed = unzalgo.clean(text);
    return striptags(unzalgoed.trim());
  }

  /**
   * The heavy hammer of text filtering that removes all malicious and annoying
   * things I know about as of this writing. Zalgo, tags, shoetest, diacritics,
   * and our own custom nonsense UTF-8 and Unicode filters.
   *
   * This filter is very heavy-handed and merciless.
   *
   * Empty and non-string input is returned unchanged.
   *
   * @param {String} text The text to be filtered
   * @returns The filtered text
   */
  filter (text) {
    // An empty string is falsy, so no separate length test is needed.
    if (!text || (typeof text !== 'string')) {
      return text;
    }

    text = DtpTextFilter.filterNonsense(text);
    text = DtpTextFilter.filterGuff(text);
    text = DtpTextFilter.filterHtml(text);

    text = shoetest.simplify(text);
    text = diacritics.remove(text);

    /*
     * Once all the stupidity has been stripped, strip the HTML
     * tags that might remain.
     */
    return this.clean(text);
  }

  /**
   * Scans input text for username mentions (`@username`) and resolves those
   * names against the `username_lc` field of the User collection.
   *
   * @param {String} content The text content to be scanned for mentions
   * @returns Array of lean User documents carrying only `_id`, one per
   * matching user; an empty array when nothing is mentioned or content is
   * not a string.
   */
  async findMentions (content) {
    if (typeof content !== 'string') {
      return [ ];
    }

    const matches = content.match(/\B@[a-z0-9_-]+/gi);
    if (!Array.isArray(matches) || (matches.length === 0)) {
      return [ ];
    }

    /*
     * Remove @, lowercase, and remove duplicates. A Set keeps the first
     * occurrence of each name, in order.
     */
    const usernames = [...new Set(matches.map((match) => match.slice(1).toLowerCase()))];

    this.log.debug('findMentions found usernames', { usernames });
    const mentions = await User
      .find({ username_lc: { $in: usernames } })
      .select('_id')
      .lean();

    return mentions;
  }

  /**
   * Scans input text for hashtags (`#tag`).
   *
   * @param {String} content The text content to be scanned for hashtags
   * @returns Array of lowercased tag names without the leading `#`, in order
   * of appearance. Duplicates are kept. Empty when there are no hashtags or
   * content is not a string.
   */
  findHashtags (content) {
    if (typeof content !== 'string') {
      return [ ];
    }

    const matches = content.match(/\B\#[a-z0-9_-]+/gi);
    if (!Array.isArray(matches) || (matches.length === 0)) {
      return [ ];
    }

    const tags = matches.map((match) => match.slice(1).toLowerCase());
    this.log.debug('hashtags extracted', { tags });
    return tags;
  }

  /**
   * Scans input text for links/URLs, performs some checks, and schedules them
   * for ingest using a worker. The worker will emit socket.io messages to
   * populate the UI with resolved link previews.
   *
   * Uses https://github.com/StevenBlack/hosts/tree/master/alternates/porn to
   * eliminate blocked domains, which are stored in Redis.
   *
   * NOTE(review): the permission check runs before the content is scanned, so
   * an author without `canShareLinks` is rejected even when the content has
   * no links. Confirm callers expect that.
   *
   * @param {User} author the author of the status being scanned
   * @param {*} content the content of the status being scanned
   * @param {*} options passed through untouched to the `link-ingest` job
   * @returns array of Link `_id` values, one per distinct link, or an empty
   * array
   * @throws {SiteError} 403 when the author may not share links, when a link
   * points at a `.cn` or `.il` host, or when its domain is blocked; 400 when
   * a detected link cannot be parsed as a URL.
   */
  async findLinks (author, content, options) {
    const NOW = new Date();
    const { link: linkService } = this.dtp.services;

    if (!author.permissions.canShareLinks) {
      throw new SiteError(403, 'You are not permitted to share links in your posts.');
    }

    if (typeof content !== 'string') {
      return [ ];
    }

    const urlRegex = /(((https?:\/\/)|(www\.))[^\s]+)/g;
    const matches = content.match(urlRegex);
    if (!Array.isArray(matches) || (matches.length === 0)) {
      this.log.debug('post content contains no URLs/links');
      return [ ];
    }

    const links = [ ];
    const seen = new Set();
    for (const match of matches) {
      /*
       * The regex also accepts scheme-less "www." links, which the URL
       * constructor rejects. Give those a scheme so they can be parsed,
       * checked, and stored in a fetchable form.
       */
      const url = /^https?:\/\//.test(match) ? match : `https://${match}`;

      // The same link repeated in one post is processed only once.
      if (seen.has(url)) {
        continue;
      }
      seen.add(url);

      let hostname;
      try {
        ({ hostname } = new URL(url));
      } catch (error) {
        // Fail closed: a link that cannot be parsed cannot be vetted.
        this.log.debug('rejecting unparseable URL in post content', { url });
        throw new SiteError(400, 'Your post contains a link that is not a valid URL.');
      }

      /*
       * Drop the optional trailing root dot ("example.cn.") so it cannot be
       * used to slip past the suffix checks and the block list.
       */
      const domain = hostname.toLowerCase().replace(/\.$/, '');
      if (domain.endsWith('.cn')) {
        throw new SiteError(403, 'Linking to Chinese websites is prohibited.');
      }
      if (domain.endsWith('.il')) {
        throw new SiteError(403, 'Linking to websites in Israel is prohibited.');
      }

      if (await linkService.isDomainBlocked(domain)) {
        this.log.alert('detected blocked domain in shared link', {
          author: { _id: author._id, username: author.username },
          domain, url,
        });
        throw new SiteError(403, `All links/URLs pointing to ${domain} are prohibited.`);
      }

      /*
       * An upsert is used to create a document if one doesn't exist. The domain
       * and url are set on insert, and lastShared is always set so it will be
       * current.
       *
       * submittedBy is an array that holds the User._id of each member that
       * submitted the link. This enables their Link History view, which becomes
       * its own feed.
       */
      const link = await Link.findOneAndUpdate(
        { domain, url },
        {
          $setOnInsert: {
            created: NOW,
            domain, url,
          },
          $addToSet: { submittedBy: author._id },
          $set: { lastShared: NOW },
        },
        { upsert: true, new: true },
      );

      /*
       * link is now the document from MongoDB. A link-ingest job is queued
       * for every shared link. Per the original author's notes, the worker
       * fetches link preview data and scans the link for malicious intent.
       *
       * The add() call is awaited so an enqueue failure surfaces here and is
       * not left as an unhandled rejection. This assumes add() returns a
       * promise; awaiting a plain value is harmless.
       */
      await this.linksQueue.add('link-ingest', {
        submitterId: author._id,
        linkId: link._id,
        options,
      });

      this.log.debug('adding detected link', { domain, url, link: link._id });
      links.push(link._id);
    }

    return links;
  }
}
|