package io.noties.markwon.html;

import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import androidx.annotation.VisibleForTesting;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import io.noties.markwon.html.jsoup.nodes.Attribute;
import io.noties.markwon.html.jsoup.nodes.Attributes;
import io.noties.markwon.html.jsoup.parser.CharacterReader;
import io.noties.markwon.html.jsoup.parser.ParseErrorList;
import io.noties.markwon.html.jsoup.parser.Token;
import io.noties.markwon.html.jsoup.parser.Tokeniser;

import static io.noties.markwon.html.AppendableUtils.appendQuietly;

/**
 * Tokenises HTML fragments and records the encountered tags as inline tags (a flat list)
 * or block tags (a tree rooted at a synthetic root block), while appending the textual
 * content of the fragment to the supplied output.
 * <p>
 * The parser is stateful: tags opened in one call to
 * {@link #processFragment(Appendable, String)} stay open for subsequent calls until they
 * are closed, flushed or {@link #reset()} is called.
 *
 * @since 2.0.0
 */
public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {

    /**
     * Creates a parser that uses the default {@link HtmlEmptyTagReplacement}.
     */
    @NonNull
    public static MarkwonHtmlParserImpl create() {
        return create(HtmlEmptyTagReplacement.create());
    }

    /**
     * Creates a parser that uses the supplied replacement for tags that have no content.
     *
     * @param inlineTagReplacement provides the text to insert for empty/void tags
     */
    @NonNull
    public static MarkwonHtmlParserImpl create(@NonNull HtmlEmptyTagReplacement inlineTagReplacement) {
        return new MarkwonHtmlParserImpl(inlineTagReplacement, TrimmingAppender.create());
    }

    // https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
    @VisibleForTesting
    static final Set<String> INLINE_TAGS;

    private static final Set<String> VOID_TAGS;

    // these are the tags that are considered _block_ ones
    // this parser will ensure that these blocks are started on a new line
    // other tags that are NOT inline are considered as block tags, but won't have new line
    // inserted before them
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
    private static final Set<String> BLOCK_TAGS;

    private static final String TAG_PARAGRAPH = "p";
    private static final String TAG_LIST_ITEM = "li";

    static {
        INLINE_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                "a", "abbr", "acronym",
                "b", "bdo", "big", "br", "button",
                "cite", "code",
                "dfn",
                "em",
                "i", "img", "input",
                "kbd",
                "label",
                "map",
                "object",
                "q",
                "samp", "script", "select", "small", "span", "strong", "sub", "sup",
                "textarea", "time", "tt",
                "var"
        )));
        VOID_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                "area",
                "base", "br",
                "col",
                "embed",
                "hr",
                "img", "input",
                "keygen",
                "link",
                "meta",
                "param",
                "source",
                "track",
                "wbr"
        )));
        BLOCK_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                "address", "article", "aside",
                "blockquote",
                "canvas",
                "dd", "div", "dl", "dt",
                "fieldset", "figcaption", "figure", "footer", "form",
                "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
                "li",
                "main",
                "nav", "noscript",
                "ol", "output",
                "p", "pre",
                "section",
                "table", "tfoot",
                "ul",
                "video"
        )));
    }

    private final HtmlEmptyTagReplacement emptyTagReplacement;

    private final TrimmingAppender trimmingAppender;

    // inline tags in the order they were opened (both still-open and already closed ones)
    private final List<HtmlTagImpl.InlineImpl> inlineTags = new ArrayList<>(0);

    // the innermost block that is currently being filled with children
    private HtmlTagImpl.BlockImpl currentBlock = HtmlTagImpl.BlockImpl.root();

    // when set, character data is appended verbatim (no trimming)
    private boolean isInsidePreTag;

    // the thing is: we ensure a new line BEFORE block tag
    // but not after, so another tag will be placed on the same line (which is wrong)
    private boolean previousIsBlock;

    MarkwonHtmlParserImpl(
            @NonNull HtmlEmptyTagReplacement replacement,
            @NonNull TrimmingAppender trimmingAppender) {
        this.emptyTagReplacement = replacement;
        this.trimmingAppender = trimmingAppender;
    }

    /**
     * Tokenises {@code htmlFragment} and dispatches every start tag, end tag and character
     * token to the matching {@code process*} method. Other token types are ignored.
     *
     * @param output       receives the text content; its current length is used as tag offsets
     * @param htmlFragment the HTML to tokenise
     */
    @Override
    public <T extends Appendable & CharSequence> void processFragment(
            @NonNull T output,
            @NonNull String htmlFragment) {

        // we might want to reuse tokeniser (at least when the same output is involved)
        // as CharacterReader does a bit of initialization (cache etc) as it's
        // primary usage is parsing a document in one run (not parsing _fragments_)
        final Tokeniser tokeniser = new Tokeniser(new CharacterReader(htmlFragment), ParseErrorList.noTracking());

        while (true) {

            final Token token = tokeniser.read();
            final Token.TokenType tokenType = token.type;

            if (Token.TokenType.EOF == tokenType) {
                break;
            }

            switch (tokenType) {

                case StartTag: {
                    final Token.StartTag startTag = (Token.StartTag) token;
                    if (isInlineTag(startTag.normalName)) {
                        processInlineTagStart(output, startTag);
                    } else {
                        processBlockTagStart(output, startTag);
                    }
                }
                break;

                case EndTag: {
                    final Token.EndTag endTag = (Token.EndTag) token;
                    if (isInlineTag(endTag.normalName)) {
                        processInlineTagEnd(output, endTag);
                    } else {
                        processBlockTagEnd(output, endTag);
                    }
                }
                break;

                case Character: {
                    processCharacter(output, ((Token.Character) token));
                }
                break;

                default:
                    // remaining token types carry nothing that is rendered here
                    break;
            }

            // do not forget to reset processed token (even if it's not processed)
            token.reset();
        }
    }

    /**
     * Hands all collected inline tags to {@code action}, most recently opened first, and
     * then forgets them. When no inline tags were collected, {@code action} receives an
     * empty list.
     *
     * @param documentLength if greater than {@link HtmlTag#NO_END}, {@code closeAt} is called
     *                       with this value on every collected tag before flushing
     * @param action         receiver of the flushed tags
     */
    @Override
    public void flushInlineTags(int documentLength, @NonNull FlushAction<HtmlTag.Inline> action) {

        if (inlineTags.isEmpty()) {
            action.apply(Collections.<HtmlTag.Inline>emptyList());
            return;
        }

        if (documentLength > HtmlTag.NO_END) {
            for (HtmlTagImpl.InlineImpl inline : inlineTags) {
                inline.closeAt(documentLength);
            }
        }

        // a copy is handed out, so clearing our own list below does not affect the receiver
        final List<HtmlTag.Inline> reverseOrder = new ArrayList<HtmlTag.Inline>(inlineTags);
        Collections.reverse(reverseOrder);
        action.apply(reverseOrder);

        inlineTags.clear();
    }

    /**
     * Hands the top-level block tags (children of the root block) to {@code action} and
     * starts a fresh block tree. When there are none, {@code action} receives an empty list.
     *
     * @param documentLength if greater than {@link HtmlTag#NO_END}, {@code closeAt} is called
     *                       with this value on the root block before flushing
     * @param action         receiver of the flushed tags
     */
    @Override
    public void flushBlockTags(int documentLength, @NonNull FlushAction<HtmlTag.Block> action) {

        // walk up to the root of the tree
        HtmlTagImpl.BlockImpl block = currentBlock;
        while (block.parent != null) {
            block = block.parent;
        }

        if (documentLength > HtmlTag.NO_END) {
            block.closeAt(documentLength);
        }

        final List<HtmlTag.Block> children = block.children();
        if (children.size() > 0) {
            action.apply(children);
        } else {
            action.apply(Collections.<HtmlTag.Block>emptyList());
        }

        currentBlock = HtmlTagImpl.BlockImpl.root();
    }

    /**
     * Discards all collected tags and returns the parser to its initial state.
     */
    @Override
    public void reset() {
        inlineTags.clear();
        currentBlock = HtmlTagImpl.BlockImpl.root();

        // these flags describe the position inside the previous input; leaving them set
        // would make the next parse start as if inside <pre> / right after a block tag
        isInsidePreTag = false;
        previousIsBlock = false;
    }

    /**
     * Registers a newly opened inline tag. Void and self-closing tags are closed immediately,
     * after appending their replacement text (if any).
     */
    protected <T extends Appendable & CharSequence> void processInlineTagStart(
            @NonNull T output,
            @NonNull Token.StartTag startTag) {

        final String name = startTag.normalName;

        // the pending new line must be emitted BEFORE the start offset is taken, otherwise
        // the tag would start on the separator and would never be detected as empty
        ensureNewLineIfPreviousWasBlock(output);

        final HtmlTagImpl.InlineImpl inline =
                new HtmlTagImpl.InlineImpl(name, output.length(), extractAttributes(startTag));

        if (isVoidTag(name) || startTag.selfClosing) {

            final String replacement = emptyTagReplacement.replace(inline);
            if (replacement != null && replacement.length() > 0) {
                appendQuietly(output, replacement);
            }

            // the thing is: we will keep this inline tag in the list,
            // but in case of void-tag that has no replacement, there will be no
            // possibility to set a span (requires at least one char)
            inline.closeAt(output.length());
        }

        inlineTags.add(inline);
    }

    /**
     * Closes the most recently opened, still-open inline tag with the same name.
     * An end tag without a matching open tag is ignored.
     */
    protected <T extends Appendable & CharSequence> void processInlineTagEnd(
            @NonNull T output,
            @NonNull Token.EndTag endTag) {

        // try to find it, if none found -> ignore
        final HtmlTagImpl.InlineImpl openInline = findOpenInlineTag(endTag.normalName);
        if (openInline == null) {
            return;
        }

        // okay, if this tag is empty -> call replacement
        if (isEmpty(output, openInline)) {
            appendEmptyTagReplacement(output, openInline);
        }

        // close open inline tag
        openInline.closeAt(output.length());
    }

    /**
     * Opens a new block (any tag that is not inline) as a child of the current block.
     * An open paragraph is closed first, as is an open list item when another list item starts.
     */
    protected <T extends Appendable & CharSequence> void processBlockTagStart(
            @NonNull T output,
            @NonNull Token.StartTag startTag) {

        final String name = startTag.normalName;

        // block tags (all that are NOT inline -> blocks
        // there is only one strong rule -> paragraph cannot contain anything
        // except inline tags
        if (TAG_PARAGRAPH.equals(currentBlock.name)) {
            // it must be closed here no matter what we are as here we _assume_
            // that it's a block tag
            currentBlock.closeAt(output.length());
            appendQuietly(output, '\n');
            currentBlock = currentBlock.parent;
        } else if (TAG_LIST_ITEM.equals(name)
                && TAG_LIST_ITEM.equals(currentBlock.name)) {
            // close previous list item if in the same parent
            currentBlock.closeAt(output.length());
            currentBlock = currentBlock.parent;
        }

        if (isBlockTag(name)) {
            // NOTE(review): this is assigned for every block-level tag, so a block tag nested
            // inside <pre> switches verbatim mode off - confirm that this is intended
            isInsidePreTag = "pre".equals(name);
            ensureNewLine(output);
        } else {
            ensureNewLineIfPreviousWasBlock(output);
        }

        final int start = output.length();

        final HtmlTagImpl.BlockImpl block =
                HtmlTagImpl.BlockImpl.create(name, start, extractAttributes(startTag), currentBlock);

        final boolean isVoid = isVoidTag(name) || startTag.selfClosing;
        if (isVoid) {
            final String replacement = emptyTagReplacement.replace(block);
            if (replacement != null && replacement.length() > 0) {
                appendQuietly(output, replacement);
            }
            block.closeAt(output.length());
        }

        //noinspection ConstantConditions
        appendBlockChild(block.parent, block);

        // if not void start filling-in children
        if (!isVoid) {
            this.currentBlock = block;
        }
    }

    /**
     * Closes the block found by {@link #findOpenBlockTag(String)} for this end tag and makes
     * its parent the current block. Does nothing when no such block is found.
     */
    protected <T extends Appendable & CharSequence> void processBlockTagEnd(
            @NonNull T output,
            @NonNull Token.EndTag endTag) {

        final String name = endTag.normalName;

        final HtmlTagImpl.BlockImpl block = findOpenBlockTag(name);
        if (block == null) {
            return;
        }

        if ("pre".equals(name)) {
            isInsidePreTag = false;
        }

        // okay, if this tag is empty -> call replacement
        if (isEmpty(output, block)) {
            appendEmptyTagReplacement(output, block);
        }

        block.closeAt(output.length());

        // if it's empty -> we do not care about if it's block or not
        if (!block.isEmpty()) {
            previousIsBlock = isBlockTag(block.name);
        }

        if (TAG_PARAGRAPH.equals(name)) {
            appendQuietly(output, '\n');
        }

        this.currentBlock = block.parent;
    }

    /**
     * Appends character data: verbatim while inside {@code pre}, otherwise through the
     * {@link TrimmingAppender} (after emitting a pending new line, if any).
     */
    protected <T extends Appendable & CharSequence> void processCharacter(
            @NonNull T output,
            @NonNull Token.Character character) {

        // there are tags: BUTTON, INPUT, SELECT, SCRIPT, TEXTAREA, STYLE
        // that might have character data that we do not want to display

        if (isInsidePreTag) {
            appendQuietly(output, character.getData());
        } else {
            ensureNewLineIfPreviousWasBlock(output);
            trimmingAppender.append(output, character.getData());
        }
    }

    /**
     * Adds {@code child} to the children of {@code parent}, creating the list on first use.
     */
    protected void appendBlockChild(@NonNull HtmlTagImpl.BlockImpl parent, @NonNull HtmlTagImpl.BlockImpl child) {
        List<HtmlTagImpl.BlockImpl> children = parent.children;
        if (children == null) {
            children = new ArrayList<>(2);
            parent.children = children;
        }
        children.add(child);
    }

    /**
     * @return the last registered inline tag with this name whose {@code end} is still
     * negative, or {@code null} when there is none
     */
    @Nullable
    protected HtmlTagImpl.InlineImpl findOpenInlineTag(@NonNull String name) {

        // search backwards so that the innermost (latest) tag is matched first
        for (int i = inlineTags.size() - 1; i > -1; i--) {
            final HtmlTagImpl.InlineImpl inline = inlineTags.get(i);
            if (name.equals(inline.name) && inline.end < 0) {
                return inline;
            }
        }

        return null;
    }

    /**
     * Walks from the current block towards the root and stops at the first block that either
     * has the given name or is already closed.
     *
     * @return the block the walk stopped at, or {@code null} when the root was passed
     */
    @Nullable
    protected HtmlTagImpl.BlockImpl findOpenBlockTag(@NonNull String name) {

        HtmlTagImpl.BlockImpl blockTag = currentBlock;

        while (blockTag != null
                && !name.equals(blockTag.name)
                && !blockTag.isClosed()) {
            blockTag = blockTag.parent;
        }

        return blockTag;
    }

    /**
     * Emits the new line that is owed after a closed block tag (at most once per block).
     */
    protected <T extends Appendable & CharSequence> void ensureNewLineIfPreviousWasBlock(@NonNull T output) {
        if (previousIsBlock) {
            ensureNewLine(output);
            previousIsBlock = false;
        }
    }

    // name here must be lower case
    protected static boolean isInlineTag(@NonNull String name) {
        return INLINE_TAGS.contains(name);
    }

    protected static boolean isVoidTag(@NonNull String name) {
        return VOID_TAGS.contains(name);
    }

    protected static boolean isBlockTag(@NonNull String name) {
        return BLOCK_TAGS.contains(name);
    }

    /**
     * Appends {@code '\n'} unless the output is empty or already ends with one.
     */
    protected static <T extends Appendable & CharSequence> void ensureNewLine(@NonNull T output) {
        final int length = output.length();
        if (length > 0 && '\n' != output.charAt(length - 1)) {
            appendQuietly(output, '\n');
        }
    }

    /**
     * Copies the attributes of a start tag into an immutable map with lower-cased keys.
     *
     * @return unmodifiable map of attributes; empty when the tag has none
     */
    @NonNull
    protected static Map<String, String> extractAttributes(@NonNull Token.StartTag startTag) {

        final Attributes attributes = startTag.attributes;
        final int size = attributes.size();

        if (size == 0) {
            return Collections.emptyMap();
        }

        final Map<String, String> map = new HashMap<>(size);
        for (Attribute attribute : attributes) {
            map.put(attribute.getKey().toLowerCase(Locale.US), attribute.getValue());
        }
        return Collections.unmodifiableMap(map);
    }

    /**
     * @return {@code true} when nothing was appended to {@code output} since {@code tag} started
     */
    protected static <T extends Appendable & CharSequence> boolean isEmpty(
            @NonNull T output,
            @NonNull HtmlTagImpl tag) {
        return tag.start == output.length();
    }

    /**
     * Appends the replacement text for a tag without content, when one is provided.
     */
    protected <T extends Appendable & CharSequence> void appendEmptyTagReplacement(
            @NonNull T output,
            @NonNull HtmlTagImpl tag) {
        final String replacement = emptyTagReplacement.replace(tag);
        if (replacement != null) {
            appendQuietly(output, replacement);
        }
    }
}