dtp-base/app/services/text.js


								// text.js

								// Copyright (C) 2024 DTP Technologies, LLC

								// All Rights Reserved


								'use strict';


								import mongoose from 'mongoose';


								const User = mongoose.model('User');

								const Link = mongoose.model('Link');


								import striptags from 'striptags';

								import unzalgo from 'unzalgo';

								import shoetest from 'shoetest';

								import diacritics from 'diacritics';

								import DtpTextFilter from './lib/edit-with-vi.js';


								import { SiteService, SiteError } from '../../lib/site-lib.js';


								/**
								 * Text processing service: cleans and filters user-supplied text, and
								 * extracts @mentions, #hashtags, and links/URLs from content.
								 */
								export default class TextService extends SiteService {

								  static get slug () { return 'text'; }
								  static get name ( ) { return 'TextService'; }

								  constructor (dtp) {
								    super(dtp, TextService);
								  }

								  /**
								   * Fetches the `links` job queue from the job queue service (configured by
								   * `dtp.config.jobQueues.links`) and keeps it on the instance for findLinks.
								   */
								  async start ( ) {
								    const { jobQueue: jobQueueService } = this.dtp.services;
								    this.linksQueue = jobQueueService.getJobQueue('links', this.dtp.config.jobQueues.links);
								  }

								  /**
								   * Basic text cleaning function to remove Zalgo and tags. Surrounding
								   * whitespace is trimmed before tags are stripped.
								   * @param {String} text The text to be cleaned
								   * @returns The cleaned text. Non-string input is returned unchanged, the
								   * same way filter() treats it.
								   */
								  clean (text) {
								    if (typeof text !== 'string') {
								      return text;
								    }
								    const unzalgoed = unzalgo.clean(text);
								    return striptags(unzalgoed.trim());
								  }

								  /**
								   * The heavy hammer of text filtering that removes all malicious and annoying
								   * things I know about as of this writing. Zalgo, tags, shoetest, diacritics,
								   * and our own custom nonsense UTF-8 and Unicode filters.
								   *
								   * This filter is very heavy-handed and merciless.
								   *
								   * @param {String} text The text to be filtered
								   * @returns The filtered text. Empty or non-string input is returned as-is.
								   */
								  filter (text) {
								    if ((typeof text !== 'string') || (text.length === 0)) {
								      return text;
								    }

								    let filtered = DtpTextFilter.filterNonsense(text);
								    filtered = DtpTextFilter.filterGuff(filtered);
								    filtered = DtpTextFilter.filterHtml(filtered);

								    filtered = shoetest.simplify(filtered);
								    filtered = diacritics.remove(filtered);

								    /*
								     * Once all the stupidity has been stripped, strip the HTML
								     * tags that might remain.
								     */
								    return this.clean(filtered);
								  }

								  /**
								   * Scans input text for username mentions (`@username`) and looks up the
								   * matching User records by their lowercased username.
								   * @param {String} content The text content to be scanned for mentions
								   * @returns Array of lean User documents containing only `_id`, one per
								   * matching user; an empty array when nothing is mentioned or content is
								   * not a string.
								   */
								  async findMentions (content) {
								    if (typeof content !== 'string') {
								      return [ ];
								    }

								    const matches = content.match(/\B@[a-z0-9_-]+/gi);
								    if (!Array.isArray(matches) || (matches.length === 0)) {
								      return [ ];
								    }

								    /*
								     * Remove @, lowercase, and remove duplicates. A Set keeps first-seen
								     * order, so the resulting list is ordered as the mentions appear.
								     */
								    const usernames = [...new Set(matches.map((match) => match.slice(1).toLowerCase()))];

								    this.log.debug('findMentions found usernames', { usernames });
								    const mentions = await User
								      .find({ username_lc: { $in: usernames } })
								      .select('_id')
								      .lean();

								    return mentions;
								  }

								  /**
								   * Scans input text for hashtags (`#tag`).
								   * @param {String} content The text content to be scanned for hashtags
								   * @returns Array of lowercased tag names without the leading `#`, in order
								   * of appearance. Duplicates are preserved. Empty array when there are no
								   * hashtags or content is not a string.
								   */
								  findHashtags (content) {
								    if (typeof content !== 'string') {
								      return [ ];
								    }

								    const matches = content.match(/\B#[a-z0-9_-]+/gi);
								    if (!Array.isArray(matches) || (matches.length === 0)) {
								      return [ ];
								    }

								    const tags = matches.map((match) => match.slice(1).toLowerCase());
								    this.log.debug('hashtags extracted', { tags });
								    return tags;
								  }

								  /**
								   * Scans input text for links/URLs, performs some checks, and schedules them
								   * for ingest using a worker. The worker will emit socket.io messages to
								   * populate the UI with resolved link previews.
								   *
								   * Uses https://github.com/StevenBlack/hosts/tree/master/alternates/porn to
								   * eliminate blocked domains, which are stored in Redis.
								   *
								   * Scheme-less `www.` matches are stored and checked as `http://www.…`.
								   * Matches that cannot be parsed as a URL are skipped. Each distinct URL is
								   * processed once.
								   *
								   * @param {User} author the author of the status being scanned
								   * @param {*} content the content of the status being scanned
								   * @param {*} options passed through unchanged to the `link-ingest` job
								   * @returns array of Link `_id` values for the links detected, or an empty
								   * array
								   * @throws {SiteError} 403 when the content contains a link and the author
								   * may not share links, or when a link points to a prohibited or blocked
								   * domain.
								   */
								  async findLinks (author, content, options) {
								    if (typeof content !== 'string') {
								      return [ ];
								    }

								    const urlRegex = /(((https?:\/\/)|(www\.))[^\s]+)/g;
								    const matches = content.match(urlRegex);
								    if (!Array.isArray(matches) || (matches.length === 0)) {
								      this.log.debug('post content contains no URLs/links');
								      return [ ];
								    }

								    /*
								     * Only enforce the permission once we know the content actually contains
								     * a link, so members who can't share links can still post plain text.
								     */
								    if (!author.permissions || !author.permissions.canShareLinks) {
								      throw new SiteError(403, 'You are not permitted to share links in your posts.');
								    }

								    const NOW = new Date();
								    const { link: linkService } = this.dtp.services;

								    const links = [ ];
								    const seen = new Set();
								    for (const match of matches) {
								      /*
								       * The regex accepts scheme-less "www." matches, which the URL parser
								       * rejects. Give them a scheme so they can be parsed and fetched.
								       */
								      const url = match.startsWith('www.') ? `http://${match}` : match;
								      if (seen.has(url)) {
								        continue;
								      }
								      seen.add(url);

								      let domain;
								      try {
								        /*
								         * Strip trailing dots ("example.cn.") so a fully-qualified hostname
								         * can't slip past the suffix and blocked-domain checks below.
								         */
								        domain = new URL(url).hostname.toLowerCase().replace(/\.+$/, '');
								      } catch (error) {
								        this.log.debug('skipping unparseable URL', { url });
								        continue;
								      }
								      if (domain.length === 0) {
								        this.log.debug('skipping URL with no hostname', { url });
								        continue;
								      }

								      if (domain.endsWith('.cn')) {
								        throw new SiteError(403, 'Linking to Chinese websites is prohibited.');
								      }
								      if (domain.endsWith('.il')) {
								        throw new SiteError(403, 'Linking to websites in Israel is prohibited.');
								      }

								      if (await linkService.isDomainBlocked(domain)) {
								        this.log.alert('detected blocked domain in shared link', {
								          author: { _id: author._id, username: author.username },
								          domain, url,
								        });
								        throw new SiteError(403, `All links/URLs pointing to ${domain} are prohibited.`);
								      }

								      /*
								       * An upsert is used to create a document if one doesn't exist. The domain
								       * and url are set on insert, and lastShared is always set so it will be
								       * current.
								       *
								       * submittedBy is an array that holds the User._id of each member that
								       * submitted the link. This enables their Link History view, which becomes
								       * its own feed.
								       */
								      const link = await Link.findOneAndUpdate(
								        { domain, url },
								        {
								          $setOnInsert: {
								            created: NOW,
								            domain, url,
								          },
								          $addToSet: { submittedBy: author._id },
								          $set: { lastShared: NOW },
								        },
								        { upsert: true, new: true },
								      );

								      /*
								       * link is now the document from MongoDB. A link-ingest job is always
								       * queued for it; the worker is what fetches link preview data and scans
								       * the link. Awaited so a queue failure surfaces to the caller instead
								       * of becoming an unhandled rejection.
								       */
								      await this.linksQueue.add('link-ingest', {
								        submitterId: author._id,
								        linkId: link._id,
								        options,
								      });

								      this.log.debug('adding detected link', { domain, url, link: link._id });
								      links.push(link._id);
								    }

								    return links;
								  }
								}