Markwon/markwon-html/src/main/java/io/noties/markwon/html/MarkwonHtmlParserImpl.java
2023-06-27 12:39:12 +08:00

485 lines
16 KiB
Java

package io.noties.markwon.html;
import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import androidx.annotation.VisibleForTesting;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import io.noties.markwon.html.jsoup.nodes.Attribute;
import io.noties.markwon.html.jsoup.nodes.Attributes;
import io.noties.markwon.html.jsoup.parser.CharacterReader;
import io.noties.markwon.html.jsoup.parser.ParseErrorList;
import io.noties.markwon.html.jsoup.parser.Token;
import io.noties.markwon.html.jsoup.parser.Tokeniser;
import static io.noties.markwon.html.AppendableUtils.appendQuietly;
/**
* @since 2.0.0
*/
public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
/**
 * Creates a parser that uses the replacement obtained from
 * {@link HtmlEmptyTagReplacement#create()}.
 *
 * @return a new parser instance
 */
@NonNull
public static MarkwonHtmlParserImpl create() {
    final HtmlEmptyTagReplacement replacement = HtmlEmptyTagReplacement.create();
    return create(replacement);
}
/**
 * Creates a parser that uses the supplied replacement together with an appender obtained
 * from {@link TrimmingAppender#create()}.
 *
 * @param inlineTagReplacement supplies the text that is appended for void, self-closing
 *                             and empty tags
 * @return a new parser instance
 */
@NonNull
public static MarkwonHtmlParserImpl create(@NonNull HtmlEmptyTagReplacement inlineTagReplacement) {
    final TrimmingAppender appender = TrimmingAppender.create();
    return new MarkwonHtmlParserImpl(inlineTagReplacement, appender);
}
// Names of the tags that this parser processes as inline tags; every other tag goes
// through the block-tag code path.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
@VisibleForTesting
static final Set<String> INLINE_TAGS;
// Names of the tags that are closed immediately when their start tag is processed
private static final Set<String> VOID_TAGS;
// These are the tags that are considered _block_ ones:
// this parser ensures that such a block starts on a new line.
// Other tags that are NOT inline are also processed as block tags, but no new line
// is inserted before them.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
private static final Set<String> BLOCK_TAGS;
// an open paragraph is closed implicitly when any non-inline tag starts
private static final String TAG_PARAGRAPH = "p";
// an open list item is closed implicitly when another list item starts
private static final String TAG_LIST_ITEM = "li";
static {
INLINE_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
"a", "abbr", "acronym",
"b", "bdo", "big", "br", "button",
"cite", "code",
"dfn",
"em",
"i", "img", "input",
"kbd",
"label",
"map",
"object",
"q",
"samp", "script", "select", "small", "span", "strong", "sub", "sup",
"textarea", "time", "tt",
"var"
)));
VOID_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
"area",
"base", "br",
"col",
"embed",
"hr",
"img", "input",
"keygen",
"link",
"meta",
"param",
"source",
"track",
"wbr"
)));
BLOCK_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
"address", "article", "aside",
"blockquote",
"canvas",
"dd", "div", "dl", "dt",
"fieldset", "figcaption", "figure", "footer", "form",
"h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
"li",
"main",
"nav", "noscript",
"ol", "output",
"p", "pre",
"section",
"table", "tfoot",
"ul",
"video"
)));
}
// supplies the text that is appended for void, self-closing and empty tags
private final HtmlEmptyTagReplacement emptyTagReplacement;
// appends character data whenever isInsidePreTag is not set
private final TrimmingAppender trimmingAppender;
// every inline tag started since the last flush/reset (open and closed), in start order
private final List<HtmlTagImpl.InlineImpl> inlineTags = new ArrayList<>(0);
// the block tag that is currently being filled; initially the block returned by BlockImpl.root()
private HtmlTagImpl.BlockImpl currentBlock = HtmlTagImpl.BlockImpl.root();
// set when a `pre` start tag is processed; while set, character data is appended directly
// instead of going through trimmingAppender
private boolean isInsidePreTag;
// The thing is: we ensure a new line BEFORE a block tag, but not after it,
// so whatever follows would be placed on the same line (which is wrong).
// This flag records whether the most recently closed non-empty tag was a block tag.
private boolean previousIsBlock;
/**
 * @param replacement      supplies the text that is appended for void, self-closing and
 *                         empty tags
 * @param trimmingAppender appends character data that is not inside a {@code pre} tag
 */
MarkwonHtmlParserImpl(
        @NonNull HtmlEmptyTagReplacement replacement,
        @NonNull TrimmingAppender trimmingAppender) {
    this.trimmingAppender = trimmingAppender;
    this.emptyTagReplacement = replacement;
}
/**
 * Tokenises {@code htmlFragment} and dispatches each start tag, end tag and character token
 * to the matching {@code process*} method. Tokens of any other type are skipped.
 *
 * @param output       receives the text produced while processing the fragment
 * @param htmlFragment HTML to tokenise
 */
@Override
public <T extends Appendable & CharSequence> void processFragment(
        @NonNull T output,
        @NonNull String htmlFragment) {

    // we might want to reuse the tokeniser (at least when the same output is involved),
    // as CharacterReader does a bit of initialization (cache etc) because its
    // primary usage is parsing a document in one run (not parsing _fragments_)
    final CharacterReader reader = new CharacterReader(htmlFragment);
    final Tokeniser tokeniser = new Tokeniser(reader, ParseErrorList.noTracking());

    for (Token token = tokeniser.read();
         Token.TokenType.EOF != token.type;
         token = tokeniser.read()) {

        switch (token.type) {

            case StartTag: {
                final Token.StartTag start = (Token.StartTag) token;
                if (!isInlineTag(start.normalName)) {
                    processBlockTagStart(output, start);
                } else {
                    processInlineTagStart(output, start);
                }
                break;
            }

            case EndTag: {
                final Token.EndTag end = (Token.EndTag) token;
                if (!isInlineTag(end.normalName)) {
                    processBlockTagEnd(output, end);
                } else {
                    processInlineTagEnd(output, end);
                }
                break;
            }

            case Character:
                processCharacter(output, (Token.Character) token);
                break;

            default:
                // nothing to do for the remaining token types
                break;
        }

        // do not forget to reset the token (even if it was not processed)
        token.reset();
    }
}
/**
 * Hands all collected inline tags to {@code action}, last started first, and then forgets
 * them. When nothing was collected {@code action} receives an empty list.
 *
 * @param documentLength when greater than {@link HtmlTag#NO_END}, {@code closeAt} is
 *                       invoked with this value on every collected tag before delivery
 * @param action         receives the tags
 */
@Override
public void flushInlineTags(int documentLength, @NonNull FlushAction<HtmlTag.Inline> action) {

    if (inlineTags.isEmpty()) {
        action.apply(Collections.<HtmlTag.Inline>emptyList());
        return;
    }

    if (documentLength > HtmlTag.NO_END) {
        for (HtmlTagImpl.InlineImpl tag : inlineTags) {
            tag.closeAt(documentLength);
        }
    }

    // copy in reverse order so the most recently started tag comes first
    final List<HtmlTag.Inline> reversed = new ArrayList<>(inlineTags.size());
    for (int i = inlineTags.size() - 1; i >= 0; i--) {
        reversed.add(inlineTags.get(i));
    }

    action.apply(reversed);
    inlineTags.clear();
}
/**
 * Hands the children of the root block to {@code action} and then starts over with a
 * new root block. When the root has no children {@code action} receives an empty list.
 *
 * @param documentLength when greater than {@link HtmlTag#NO_END}, {@code closeAt} is
 *                       invoked with this value on the root block before delivery
 * @param action         receives the top-level block tags
 */
@Override
public void flushBlockTags(int documentLength, @NonNull FlushAction<HtmlTag.Block> action) {

    // climb from the current block up to the one that has no parent
    HtmlTagImpl.BlockImpl root = currentBlock;
    for (HtmlTagImpl.BlockImpl parent = root.parent; parent != null; parent = root.parent) {
        root = parent;
    }

    if (documentLength > HtmlTag.NO_END) {
        root.closeAt(documentLength);
    }

    final List<HtmlTag.Block> topLevel = root.children();
    final List<HtmlTag.Block> delivered = topLevel.size() > 0
            ? topLevel
            : Collections.<HtmlTag.Block>emptyList();
    action.apply(delivered);

    currentBlock = HtmlTagImpl.BlockImpl.root();
}
/**
 * Discards everything collected so far: the inline tags, the block tree and the flags
 * that describe the current position inside the processed HTML.
 */
@Override
public void reset() {
    inlineTags.clear();
    currentBlock = HtmlTagImpl.BlockImpl.root();
    // Both flags describe the document that is being discarded. If they were kept,
    // an unclosed `pre` would disable trimming for the next document, and a trailing
    // block tag would force a new line before the next document's first content.
    isInsidePreTag = false;
    previousIsBlock = false;
}
/**
 * Registers an inline tag that starts at the current end of {@code output}. Void and
 * self-closing tags get their replacement (if any) appended and are closed right away.
 *
 * @param output   text produced so far; may receive a new line and the tag replacement
 * @param startTag start tag token to process
 */
protected <T extends Appendable & CharSequence> void processInlineTagStart(
        @NonNull T output,
        @NonNull Token.StartTag startTag) {

    final String name = startTag.normalName;

    // The separating new line (after a closed block tag) must be appended BEFORE the start
    // offset is captured. Otherwise the tag would span that new line, and isEmpty() would
    // never report an empty inline tag that directly follows a block tag, because its start
    // offset would no longer equal the output length. processBlockTagStart captures its
    // start offset after the new line for the same reason.
    ensureNewLineIfPreviousWasBlock(output);

    final HtmlTagImpl.InlineImpl inline =
            new HtmlTagImpl.InlineImpl(name, output.length(), extractAttributes(startTag));

    if (isVoidTag(name)
            || startTag.selfClosing) {

        final String replacement = emptyTagReplacement.replace(inline);
        if (replacement != null
                && replacement.length() > 0) {
            AppendableUtils.appendQuietly(output, replacement);
        }

        // the thing is: we keep this inline tag in the list, but a void tag without
        // a replacement cannot have a span set on it (that requires at least one char)
        inline.closeAt(output.length());
    }

    inlineTags.add(inline);
}
/**
 * Closes the most recently started open inline tag with the name of {@code endTag}.
 * An end tag without a matching open tag is ignored.
 *
 * @param output text produced so far; receives the replacement when the tag is empty
 * @param endTag end tag token to process
 */
protected <T extends Appendable & CharSequence> void processInlineTagEnd(
        @NonNull T output,
        @NonNull Token.EndTag endTag) {

    final HtmlTagImpl.InlineImpl inline = findOpenInlineTag(endTag.normalName);
    if (inline == null) {
        // nothing to close
        return;
    }

    // a tag without any content gets its replacement before it is closed
    if (isEmpty(output, inline)) {
        appendEmptyTagReplacement(output, inline);
    }

    inline.closeAt(output.length());
}
/**
 * Opens a block tag (every tag that is NOT inline is processed here) as a child of the
 * current block. An open paragraph, or an open list item when another list item starts,
 * is closed first. Void and self-closing tags are closed right away and do not become the
 * current block.
 *
 * @param output   text produced so far; may receive new lines and the tag replacement
 * @param startTag start tag token to process
 */
protected <T extends Appendable & CharSequence> void processBlockTagStart(
        @NonNull T output,
        @NonNull Token.StartTag startTag) {

    final String name = startTag.normalName;

    // there is only one strong rule -> a paragraph cannot contain anything except inline
    // tags, so it is closed no matter which tag this is (here it is _assumed_ to be a block)
    final boolean closesParagraph = TAG_PARAGRAPH.equals(currentBlock.name);

    // a list item closes the previous list item that is still open
    final boolean closesListItem = !closesParagraph
            && TAG_LIST_ITEM.equals(name)
            && TAG_LIST_ITEM.equals(currentBlock.name);

    if (closesParagraph || closesListItem) {
        currentBlock.closeAt(output.length());
        if (closesParagraph) {
            AppendableUtils.appendQuietly(output, '\n');
        }
        currentBlock = currentBlock.parent;
    }

    if (!isBlockTag(name)) {
        ensureNewLineIfPreviousWasBlock(output);
    } else {
        isInsidePreTag = "pre".equals(name);
        ensureNewLine(output);
    }

    // the start offset is taken after a possible new line was appended
    final HtmlTagImpl.BlockImpl block = HtmlTagImpl.BlockImpl.create(
            name,
            output.length(),
            extractAttributes(startTag),
            currentBlock);

    if (isVoidTag(name) || startTag.selfClosing) {

        final String replacement = emptyTagReplacement.replace(block);
        if (replacement != null
                && replacement.length() > 0) {
            AppendableUtils.appendQuietly(output, replacement);
        }
        block.closeAt(output.length());

        //noinspection ConstantConditions
        appendBlockChild(block.parent, block);

    } else {

        //noinspection ConstantConditions
        appendBlockChild(block.parent, block);

        // not void -> start filling-in children
        this.currentBlock = block;
    }
}
/**
 * Closes the block found by {@link #findOpenBlockTag(String)} for the name of
 * {@code endTag} and makes its parent the current block. Nothing happens when no block
 * is found.
 *
 * @param output text produced so far; may receive the tag replacement and a new line
 * @param endTag end tag token to process
 */
protected <T extends Appendable & CharSequence> void processBlockTagEnd(
        @NonNull T output,
        @NonNull Token.EndTag endTag) {

    final String name = endTag.normalName;

    final HtmlTagImpl.BlockImpl block = findOpenBlockTag(name);
    if (block == null) {
        return;
    }

    if ("pre".equals(name)) {
        isInsidePreTag = false;
    }

    // a tag without any content gets its replacement before it is closed
    if (isEmpty(output, block)) {
        appendEmptyTagReplacement(output, block);
    }

    block.closeAt(output.length());

    // if it's empty -> we do not care whether it's a block or not
    if (!block.isEmpty()) {
        previousIsBlock = isBlockTag(block.name);
    }

    if (TAG_PARAGRAPH.equals(name)) {
        AppendableUtils.appendQuietly(output, '\n');
    }

    this.currentBlock = block.parent;
}
/**
 * Appends the data of a character token to {@code output}: directly while inside a
 * {@code pre} tag, otherwise through the trimming appender (after a new line if a block
 * tag was closed just before).
 *
 * @param output    text produced so far
 * @param character character token to process
 */
protected <T extends Appendable & CharSequence> void processCharacter(
        @NonNull T output,
        @NonNull Token.Character character) {

    // there are tags: BUTTON, INPUT, SELECT, SCRIPT, TEXTAREA, STYLE
    // that might have character data that we do not want to display

    if (!isInsidePreTag) {
        ensureNewLineIfPreviousWasBlock(output);
        trimmingAppender.append(output, character.getData());
        return;
    }

    appendQuietly(output, character.getData());
}
/**
 * Adds {@code child} to the children of {@code parent}, creating the list of children
 * on first use.
 *
 * @param parent block that receives the child
 * @param child  block to add
 */
protected void appendBlockChild(@NonNull HtmlTagImpl.BlockImpl parent, @NonNull HtmlTagImpl.BlockImpl child) {
    if (parent.children == null) {
        parent.children = new ArrayList<>(2);
    }
    parent.children.add(child);
}
/**
 * Searches the collected inline tags, most recently started first, for a tag with the
 * given name whose {@code end} is still negative.
 *
 * @param name tag name to look for
 * @return the matching tag or {@code null} when there is none
 */
@Nullable
protected HtmlTagImpl.InlineImpl findOpenInlineTag(@NonNull String name) {
    int index = inlineTags.size();
    while (--index >= 0) {
        final HtmlTagImpl.InlineImpl candidate = inlineTags.get(index);
        final boolean stillOpen = candidate.end < 0;
        if (stillOpen && name.equals(candidate.name)) {
            return candidate;
        }
    }
    return null;
}
/**
 * Walks from the current block towards the root and returns the first block that either
 * has the given name or reports {@code isClosed()}.
 *
 * @param name tag name to look for
 * @return the block at which the walk stopped or {@code null} when the root was passed
 * without a hit
 */
@Nullable
protected HtmlTagImpl.BlockImpl findOpenBlockTag(@NonNull String name) {
    for (HtmlTagImpl.BlockImpl candidate = currentBlock;
         candidate != null;
         candidate = candidate.parent) {
        // NOTE: a closed block stops the walk even when its name does not match
        if (name.equals(candidate.name) || candidate.isClosed()) {
            return candidate;
        }
    }
    return null;
}
/**
 * Makes sure that {@code output} ends with a new line when a non-empty block tag was
 * closed just before, and clears that marker.
 *
 * @param output text produced so far
 */
protected <T extends Appendable & CharSequence> void ensureNewLineIfPreviousWasBlock(@NonNull T output) {
    if (!previousIsBlock) {
        return;
    }
    ensureNewLine(output);
    previousIsBlock = false;
}
/**
 * @param name tag name, must be lower case
 * @return {@code true} when {@code name} is contained in {@link #INLINE_TAGS}
 */
protected static boolean isInlineTag(@NonNull String name) {
    final boolean inline = INLINE_TAGS.contains(name);
    return inline;
}
/**
 * @param name tag name to check
 * @return {@code true} when {@code name} is contained in the set of void tags
 */
protected static boolean isVoidTag(@NonNull String name) {
    final boolean isVoid = VOID_TAGS.contains(name);
    return isVoid;
}
/**
 * @param name tag name to check
 * @return {@code true} when {@code name} is contained in the set of block tags
 */
protected static boolean isBlockTag(@NonNull String name) {
    final boolean block = BLOCK_TAGS.contains(name);
    return block;
}
/**
 * Appends a new line unless {@code output} is empty or already ends with one.
 *
 * @param output text produced so far
 */
protected static <T extends Appendable & CharSequence> void ensureNewLine(@NonNull T output) {
    final int length = output.length();
    if (length <= 0) {
        // nothing written yet -> no new line required
        return;
    }
    final char last = output.charAt(length - 1);
    if (last != '\n') {
        AppendableUtils.appendQuietly(output, '\n');
    }
}
/**
 * Copies the attributes of {@code startTag} into an unmodifiable map; keys are converted
 * to lower case with {@link Locale#US}.
 *
 * @param startTag token to read the attributes from
 * @return attribute values by lower-cased key, an empty map when there are no attributes
 */
@NonNull
protected static Map<String, String> extractAttributes(@NonNull Token.StartTag startTag) {

    final Attributes attributes = startTag.attributes;
    final int count = attributes.size();

    if (count <= 0) {
        return Collections.<String, String>emptyMap();
    }

    final Map<String, String> collected = new HashMap<>(count);
    for (Attribute attribute : attributes) {
        final String key = attribute.getKey().toLowerCase(Locale.US);
        collected.put(key, attribute.getValue());
    }
    return Collections.unmodifiableMap(collected);
}
/**
 * @param output text produced so far
 * @param tag    tag to check
 * @return {@code true} when nothing was appended to {@code output} since the start offset
 * of {@code tag}, i.e. the start offset equals the current length
 */
protected static <T extends Appendable & CharSequence> boolean isEmpty(
        @NonNull T output,
        @NonNull HtmlTagImpl tag) {
    final int currentLength = output.length();
    return currentLength == tag.start;
}
/**
 * Appends the replacement that the configured {@link HtmlEmptyTagReplacement} returns
 * for {@code tag}; nothing is appended when it returns {@code null}.
 *
 * @param output text produced so far
 * @param tag    tag to obtain the replacement for
 */
protected <T extends Appendable & CharSequence> void appendEmptyTagReplacement(
        @NonNull T output,
        @NonNull HtmlTagImpl tag) {
    final String replacement = emptyTagReplacement.replace(tag);
    if (replacement == null) {
        return;
    }
    AppendableUtils.appendQuietly(output, replacement);
}
}