LCOV - code coverage report
Current view: top level - lib/src/utils - html_to_text.dart (source / functions) Coverage Total Hit
Test: merged.info Lines: 98.5 % 131 129
Test Date: 2025-01-14 13:39:53 Functions: - 0 0

            Line data    Source code
       1              : /*
       2              :  *   Famedly Matrix SDK
       3              :  *   Copyright (C) 2021 Famedly GmbH
       4              :  *
       5              :  *   This program is free software: you can redistribute it and/or modify
       6              :  *   it under the terms of the GNU Affero General Public License as
       7              :  *   published by the Free Software Foundation, either version 3 of the
       8              :  *   License, or (at your option) any later version.
       9              :  *
      10              :  *   This program is distributed in the hope that it will be useful,
      11              :  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
      12              :  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      13              :  *   GNU Affero General Public License for more details.
      14              :  *
      15              :  *   You should have received a copy of the GNU Affero General Public License
      16              :  *   along with this program.  If not, see <https://www.gnu.org/licenses/>.
      17              :  */
      18              : 
      19              : import 'package:collection/collection.dart';
      20              : import 'package:html/dom.dart';
      21              : import 'package:html/parser.dart';
      22              : import 'package:html_unescape/html_unescape.dart';
      23              : 
      24              : class HtmlToText {
      25              :   /// Convert an HTML string to a pseudo-markdown plain text representation, with
      26              :   /// `data-mx-spoiler` spans redacted
      27            4 :   static String convert(String html) {
      28              :     // riot-web is notorious for creating bad reply fallback events from invalid messages which, if
      29              :     // not handled properly, can lead to impersonation. As such, we strip the entire `<mx-reply>` tags
      30              :     // here already, to prevent that from happening.
      31              :     // We do *not* do this in an AST and just with simple regex here, as riot-web tends to create
      32              :     // miss-matching tags, and this way we actually correctly identify what we want to strip and, well,
      33              :     // strip it.
      34            4 :     final renderHtml = html.replaceAll(
      35            4 :       RegExp(
      36              :         '<mx-reply>.*</mx-reply>',
      37              :         caseSensitive: false,
      38              :         multiLine: false,
      39              :         dotAll: true,
      40              :       ),
      41              :       '',
      42              :     );
      43              : 
      44            4 :     final opts = _ConvertOpts();
      45            8 :     var reply = _walkNode(opts, parseFragment(renderHtml));
      46            8 :     reply = reply.replaceAll(RegExp(r'\s*$', multiLine: false), '');
      47              :     return reply;
      48              :   }
      49              : 
      50            2 :   static String _parsePreContent(_ConvertOpts opts, Element node) {
      51            2 :     var text = node.innerHtml;
      52              :     final match =
      53            2 :         RegExp(r'^<code([^>]*)>', multiLine: false, caseSensitive: false)
      54            2 :             .firstMatch(text);
      55              :     if (match == null) {
      56            4 :       text = HtmlUnescape().convert(text);
      57            2 :       if (text.isNotEmpty) {
      58            4 :         if (text[0] != '\n') {
      59            2 :           text = '\n$text';
      60              :         }
      61            8 :         if (text[text.length - 1] != '\n') {
      62            2 :           text += '\n';
      63              :         }
      64              :       }
      65              :       return text;
      66              :     }
      67              :     // remove <code> opening tag
      68            4 :     text = text.substring(match.end);
      69              :     // remove the </code> closing tag
      70            2 :     text = text.replaceAll(
      71            2 :       RegExp(r'</code>$', multiLine: false, caseSensitive: false),
      72              :       '',
      73              :     );
      74            4 :     text = HtmlUnescape().convert(text);
      75            2 :     if (text.isNotEmpty) {
      76            4 :       if (text[0] != '\n') {
      77            2 :         text = '\n$text';
      78              :       }
      79            8 :       if (text[text.length - 1] != '\n') {
      80            2 :         text += '\n';
      81              :       }
      82              :     }
      83              :     final language =
      84            2 :         RegExp(r'language-(\w+)', multiLine: false, caseSensitive: false)
      85            4 :             .firstMatch(match.group(1)!);
      86              :     if (language != null) {
      87            4 :       text = language.group(1)! + text;
      88              :     }
      89              :     return text;
      90              :   }
      91              : 
      92            2 :   static String _parseBlockquoteContent(_ConvertOpts opts, Element node) {
      93            2 :     final msg = _walkChildNodes(opts, node);
      94           12 :     return '${msg.split('\n').map((s) => '> $s').join('\n')}\n';
      95              :   }
      96              : 
      97            2 :   static String _parseSpanContent(_ConvertOpts opts, Element node) {
      98            2 :     final content = _walkChildNodes(opts, node);
      99            6 :     if (node.attributes['data-mx-spoiler'] is String) {
     100            4 :       var spoiler = '█' * content.length;
     101            4 :       final reason = node.attributes['data-mx-spoiler'];
     102            2 :       if (reason != '') {
     103            2 :         spoiler = '($reason) $spoiler';
     104              :       }
     105              :       return spoiler;
     106              :     }
     107              :     return content;
     108              :   }
     109              : 
     110            2 :   static String _parseUlContent(_ConvertOpts opts, Element node) {
     111            4 :     opts.listDepth++;
     112            4 :     final entries = _listChildNodes(opts, node, {'li'});
     113            4 :     opts.listDepth--;
     114              :     final bulletPoint =
     115            8 :         _listBulletPoints[opts.listDepth % _listBulletPoints.length];
     116              : 
     117              :     return entries
     118            2 :         .map(
     119            2 :           (s) =>
     120           14 :               '${'    ' * opts.listDepth}$bulletPoint ${s.replaceAll('\n', '\n${'    ' * opts.listDepth}  ')}',
     121              :         )
     122            2 :         .join('\n');
     123              :   }
     124              : 
     125            2 :   static String _parseOlContent(_ConvertOpts opts, Element node) {
     126            4 :     opts.listDepth++;
     127            4 :     final entries = _listChildNodes(opts, node, {'li'});
     128            4 :     opts.listDepth--;
     129            4 :     final startStr = node.attributes['start'];
     130            2 :     final start = (startStr is String &&
     131            4 :             RegExp(r'^[0-9]+$', multiLine: false).hasMatch(startStr))
     132            2 :         ? int.parse(startStr)
     133              :         : 1;
     134              : 
     135              :     return entries
     136            2 :         .mapIndexed(
     137            2 :           (index, s) =>
     138           16 :               '${'    ' * opts.listDepth}${start + index}. ${s.replaceAll('\n', '\n${'    ' * opts.listDepth}  ')}',
     139              :         )
     140            2 :         .join('\n');
     141              :   }
     142              : 
     143              :   static const _listBulletPoints = <String>['●', '○', '■', '‣'];
     144              : 
     145            2 :   static List<String> _listChildNodes(
     146              :     _ConvertOpts opts,
     147              :     Element node, [
     148              :     Iterable<String>? types,
     149              :   ]) {
     150            2 :     final replies = <String>[];
     151            4 :     for (final child in node.nodes) {
     152              :       if (types != null &&
     153            2 :           types.isNotEmpty &&
     154            2 :           ((child is Text) ||
     155            2 :               ((child is Element) &&
     156            6 :                   !types.contains(child.localName!.toLowerCase())))) {
     157              :         continue;
     158              :       }
     159            4 :       replies.add(_walkNode(opts, child));
     160              :     }
     161              :     return replies;
     162              :   }
     163              : 
     164              :   static const _blockTags = <String>{
     165              :     'blockquote',
     166              :     'ul',
     167              :     'ol',
     168              :     'h1',
     169              :     'h2',
     170              :     'h3',
     171              :     'h4',
     172              :     'h5',
     173              :     'h6',
     174              :     'pre',
     175              :   };
     176              : 
     177            4 :   static String _walkChildNodes(_ConvertOpts opts, Node node) {
     178              :     var reply = '';
     179              :     var lastTag = '';
     180            8 :     for (final child in node.nodes) {
     181           12 :       final thisTag = child is Element ? child.localName!.toLowerCase() : '';
     182            8 :       if (thisTag == 'p' && lastTag == 'p') {
     183            2 :         reply += '\n\n';
     184            4 :       } else if (_blockTags.contains(thisTag) &&
     185            4 :           reply.isNotEmpty &&
     186            8 :           reply[reply.length - 1] != '\n') {
     187            2 :         reply += '\n';
     188              :       }
     189            8 :       reply += _walkNode(opts, child);
     190            4 :       if (thisTag.isNotEmpty) {
     191              :         lastTag = thisTag;
     192              :       }
     193              :     }
     194              :     return reply;
     195              :   }
     196              : 
     197            4 :   static String _walkNode(_ConvertOpts opts, Node node) {
     198            4 :     if (node is Text) {
     199              :       // ignore \n between single nodes
     200           12 :       return node.text == '\n' ? '' : node.text;
     201            4 :     } else if (node is Element) {
     202            8 :       final tag = node.localName!.toLowerCase();
     203              :       switch (tag) {
     204            4 :         case 'em':
     205            4 :         case 'i':
     206            8 :           return '*${_walkChildNodes(opts, node)}*';
     207            4 :         case 'strong':
     208            4 :         case 'b':
     209            8 :           return '**${_walkChildNodes(opts, node)}**';
     210            4 :         case 'u':
     211            4 :         case 'ins':
     212            4 :           return '__${_walkChildNodes(opts, node)}__';
     213            4 :         case 'del':
     214            4 :         case 'strike':
     215            4 :         case 's':
     216            4 :           return '~~${_walkChildNodes(opts, node)}~~';
     217            4 :         case 'code':
     218            4 :           return '`${node.text}`';
     219            4 :         case 'pre':
     220            4 :           return '```${_parsePreContent(opts, node)}```\n';
     221            4 :         case 'a':
     222            8 :           final href = node.attributes['href'] ?? '';
     223            4 :           final content = _walkChildNodes(opts, node);
     224            8 :           if (href.toLowerCase().startsWith('https://matrix.to/#/') ||
     225            8 :               href.toLowerCase().startsWith('matrix:')) {
     226              :             return content;
     227              :           }
     228            4 :           return '🔗$content';
     229            4 :         case 'img':
     230            4 :           return node.attributes['alt'] ??
     231            0 :               node.attributes['title'] ??
     232            0 :               node.attributes['src'] ??
     233              :               '';
     234            4 :         case 'br':
     235              :           return '\n';
     236            4 :         case 'blockquote':
     237            2 :           return _parseBlockquoteContent(opts, node);
     238            4 :         case 'ul':
     239            2 :           return _parseUlContent(opts, node);
     240            4 :         case 'ol':
     241            2 :           return _parseOlContent(opts, node);
     242            4 :         case 'mx-reply':
     243              :           return '';
     244            4 :         case 'hr':
     245              :           return '\n----------\n';
     246            4 :         case 'h1':
     247            4 :         case 'h2':
     248            4 :         case 'h3':
     249            4 :         case 'h4':
     250            4 :         case 'h5':
     251            4 :         case 'h6':
     252           12 :           final mark = '#' * int.parse(tag[1]);
     253            8 :           return '$mark ${_walkChildNodes(opts, node)}\n';
     254            4 :         case 'span':
     255            2 :           return _parseSpanContent(opts, node);
     256              :         default:
     257            4 :           return _walkChildNodes(opts, node);
     258              :       }
     259              :     } else {
     260            4 :       return _walkChildNodes(opts, node);
     261              :     }
     262              :   }
     263              : }
     264              : 
     265              : class _ConvertOpts {
     266              :   int listDepth = 0;
     267              : }
        

Generated by: LCOV version 2.0-1