Added 2 modules: html-parser-api and html-parser-impl

2018-08-17 12:53:36 +03:00 · 2018-08-17 12:53:36 +03:00 · ff3bedc37e
commit ff3bedc37e
parent 7c7b1f59a8
24 changed files with 4939 additions and 1 deletions
--- a/html-parser-api/build.gradle
+++ b/html-parser-api/build.gradle
@ -0,0 +1,31 @@
+// Build script of the `html-parser-api` Android library module.
+apply plugin: 'com.android.library'
+
+android {
+
+    compileSdkVersion TARGET_SDK
+    buildToolsVersion BUILD_TOOLS
+
+    defaultConfig {
+        minSdkVersion MIN_SDK
+        targetSdkVersion TARGET_SDK
+        versionCode 1
+        versionName version
+    }
+}
+
+dependencies {
+    api SUPPORT_ANNOTATIONS
+}
+
+// switch off the `generateReleaseBuildConfig` task once the project has been evaluated
+afterEvaluate {
+    generateReleaseBuildConfig.enabled = false
+}
+
+// publishing is configured only when the `release` property is present
+// todo: remove `local` check after merge with latest version (1.1.1)
+if (hasProperty('release')) {
+    if (hasProperty('local')) {
+        // `local` publishing: both repository URLs point at LOCAL_MAVEN_URL
+        ext.RELEASE_REPOSITORY_URL = LOCAL_MAVEN_URL
+        ext.SNAPSHOT_REPOSITORY_URL = LOCAL_MAVEN_URL
+    }
+    // NOTE(review): this script is fetched from the `master` branch at build time, so its
+    // content is not pinned -- consider referencing a specific commit instead
+    apply from: 'https://raw.githubusercontent.com/noties/gradle-mvn-push/master/gradle-mvn-push-aar.gradle'
+}
--- a/html-parser-api/src/main/AndroidManifest.xml
+++ b/html-parser-api/src/main/AndroidManifest.xml
@ -0,0 +1 @@
+<manifest package="ru.noties.markwon.html" />
--- a/html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java
+++ b/html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java
@ -0,0 +1,54 @@
+package ru.noties.markwon.html;
+
+import android.support.annotation.NonNull;
+import android.support.annotation.Nullable;
+
+import java.util.List;
+
+/**
+ * Describes a single HTML tag: its normalized name and the indexes at which it starts and ends.
+ * A tag is either {@link Inline} or {@link Block}.
+ *
+ * @see Inline
+ * @see Block
+ */
+public interface HtmlTag {
+
+    /**
+     * @return normalized tag name (lower-case)
+     */
+    @NonNull
+    String name();
+
+    /**
+     * @return index at which this tag starts
+     */
+    int start();
+
+    /**
+     * @return index at which this tag ends
+     */
+    int end();
+
+    /**
+     * Represents <em>really</em> inline HTML tags (unlike commonmark definitions)
+     */
+    interface Inline extends HtmlTag {
+    }
+
+    /**
+     * Represents HTML block tags. Please note that all tags that are not inline should be
+     * considered as block tags. Blocks form a tree via {@link #parent()} and {@link #children()}.
+     */
+    interface Block extends HtmlTag {
+
+        /**
+         * @return parent {@link Block} or null if there is no parent (this block is at root level)
+         */
+        @Nullable
+        Block parent();
+
+        /**
+         * @return list of children
+         */
+        @NonNull
+        List<Block> children();
+    }
+}
--- a/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParser.java
+++ b/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParser.java
@ -0,0 +1,36 @@
+package ru.noties.markwon.html;
+
+import android.support.annotation.NonNull;
+
+import java.util.List;
+
+/**
+ * Contract of an HTML parser that is fed HTML fragments one at a time and later hands over the
+ * inline and block tags it has collected.
+ */
+public abstract class MarkwonHtmlParser {
+
+    /**
+     * @return a parser whose methods all do nothing
+     */
+    @NonNull
+    public static MarkwonHtmlParser noOp() {
+        return new MarkwonHtmlParserNoOp();
+    }
+
+    /**
+     * Callback that receives the tags handed over by one of the {@code flush*} methods.
+     *
+     * @param <T> type of the tags
+     */
+    public interface FlushAction<T> {
+        void apply(@NonNull List<T> tags);
+    }
+
+    /**
+     * Processes a fragment of HTML.
+     *
+     * @param output       the output the fragment is processed against; it is both
+     *                     {@link Appendable} and {@link CharSequence}
+     * @param htmlFragment HTML to process
+     */
+    public abstract <T extends Appendable & CharSequence> void processFragment(
+            @NonNull T output,
+            @NonNull String htmlFragment);
+
+    // clear all pending tags (if any)
+    // todo: we also can do this: if supplied value is -1 (for example) we ignore tags that are not closed
+    public abstract void flushInlineTags(
+            int documentLength,
+            @NonNull FlushAction<HtmlTag.Inline> action);
+
+    // clear all pending blocks if any
+    // todo: we also can do this: if supplied value is -1 (for example) we ignore tags that are not closed
+    public abstract void flushBlockTags(
+            int documentLength,
+            @NonNull FlushAction<HtmlTag.Block> action);
+
+    /**
+     * Resets the state of this parser.
+     */
+    public abstract void reset();
+
+}
--- a/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserNoOp.java
+++ b/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserNoOp.java
@ -0,0 +1,26 @@
+package ru.noties.markwon.html;
+
+import android.support.annotation.NonNull;
+
+/**
+ * {@link MarkwonHtmlParser} whose every method has an empty body: nothing is appended to the
+ * output and flush actions are never invoked.
+ */
+class MarkwonHtmlParserNoOp extends MarkwonHtmlParser {
+
+    @Override
+    public <T extends Appendable & CharSequence> void processFragment(@NonNull T output, @NonNull String htmlFragment) {
+        // no op
+    }
+
+    @Override
+    public void flushInlineTags(int documentLength, @NonNull FlushAction<HtmlTag.Inline> action) {
+        // no op, `action` is not called
+    }
+
+    @Override
+    public void flushBlockTags(int documentLength, @NonNull FlushAction<HtmlTag.Block> action) {
+        // no op, `action` is not called
+    }
+
+    @Override
+    public void reset() {
+        // no op, there is no state
+    }
+}
--- a/html-parser-impl/build.gradle
+++ b/html-parser-impl/build.gradle
@ -0,0 +1,32 @@
+// Build script of the `html-parser-impl` Android library module.
+apply plugin: 'com.android.library'
+
+android {
+
+    compileSdkVersion TARGET_SDK
+    buildToolsVersion BUILD_TOOLS
+
+    defaultConfig {
+        minSdkVersion MIN_SDK
+        targetSdkVersion TARGET_SDK
+        versionCode 1
+        versionName version
+    }
+}
+
+dependencies {
+    api SUPPORT_ANNOTATIONS
+    // exposed as `api`, so the parser API module is visible to consumers of this module
+    api project(':html-parser-api')
+}
+
+// switch off the `generateReleaseBuildConfig` task once the project has been evaluated
+afterEvaluate {
+    generateReleaseBuildConfig.enabled = false
+}
+
+// publishing is configured only when the `release` property is present
+// todo: remove `local` check after merge with latest version (1.1.1)
+if (hasProperty('release')) {
+    if (hasProperty('local')) {
+        // `local` publishing: both repository URLs point at LOCAL_MAVEN_URL
+        ext.RELEASE_REPOSITORY_URL = LOCAL_MAVEN_URL
+        ext.SNAPSHOT_REPOSITORY_URL = LOCAL_MAVEN_URL
+    }
+    // NOTE(review): this script is fetched from the `master` branch at build time, so its
+    // content is not pinned -- consider referencing a specific commit instead
+    apply from: 'https://raw.githubusercontent.com/noties/gradle-mvn-push/master/gradle-mvn-push-aar.gradle'
+}
--- a/html-parser-impl/src/main/AndroidManifest.xml
+++ b/html-parser-impl/src/main/AndroidManifest.xml
@ -0,0 +1 @@
+<manifest package="ru.noties.markwon.html" />
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java
@ -0,0 +1,117 @@
+package ru.noties.markwon.html;
+
+import android.support.annotation.NonNull;
+import android.support.annotation.Nullable;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Base implementation of {@link HtmlTag}. A tag is created <em>open</em> (its {@link #end} is
+ * {@link #NO_VALUE}) and becomes closed once an end index is assigned.
+ */
+abstract class HtmlTagImpl implements HtmlTag {
+
+    /**
+     * Value of {@link #end} while the tag is not closed
+     */
+    static final int NO_VALUE = -1;
+
+    final String name;
+    final int start;
+    int end = NO_VALUE;
+
+    protected HtmlTagImpl(@NonNull String name, int start) {
+        this.name = name;
+        this.start = start;
+    }
+
+    @NonNull
+    @Override
+    public String name() {
+        return name;
+    }
+
+    @Override
+    public int start() {
+        return start;
+    }
+
+    @Override
+    public int end() {
+        return end;
+    }
+
+    /**
+     * @return true if an end index has been assigned to this tag
+     */
+    boolean isClosed() {
+        return end > NO_VALUE;
+    }
+
+    /**
+     * Closes this tag at the specified index. Has no effect if the tag is already closed.
+     *
+     * @param end index at which this tag ends
+     */
+    abstract void closeAt(int end);
+
+
+    static class InlineImpl extends HtmlTagImpl implements Inline {
+
+        InlineImpl(@NonNull String name, int start) {
+            super(name, start);
+        }
+
+        @Override
+        void closeAt(int end) {
+            if (!isClosed()) {
+                super.end = end;
+            }
+        }
+    }
+
+    static class BlockImpl extends HtmlTagImpl implements Block {
+
+        /**
+         * @return artificial root block (empty name, start at 0, no parent) that holds
+         * top-level blocks as its children
+         */
+        @NonNull
+        static BlockImpl root() {
+            return new BlockImpl("", 0, null);
+        }
+
+        @NonNull
+        static BlockImpl create(@NonNull String name, int start, @NonNull BlockImpl parent) {
+            return new BlockImpl(name, start, parent);
+        }
+
+        // null only for the artificial root
+        final BlockImpl parent;
+
+        // null until the first child is added or this block is closed via #closeAt(int)
+        List<BlockImpl> children;
+
+        BlockImpl(@NonNull String name, int start, @Nullable BlockImpl parent) {
+            super(name, start);
+            this.parent = parent;
+        }
+
+        /**
+         * Closes this block and all of its still-open children at the specified index and
+         * freezes the list of children. Has no effect if this block is already closed.
+         */
+        @Override
+        void closeAt(int end) {
+            if (!isClosed()) {
+                super.end = end;
+                if (children != null) {
+                    for (BlockImpl child: children) {
+                        child.closeAt(end);
+                    }
+                    children = Collections.unmodifiableList(children);
+                } else {
+                    children = Collections.emptyList();
+                }
+            }
+        }
+
+        boolean isRoot() {
+            return parent == null;
+        }
+
+        /**
+         * @return parent block or null if this block is at root level
+         * @throws IllegalStateException if called on the artificial root itself
+         */
+        @Nullable
+        @Override
+        public Block parent() {
+            if (parent == null) {
+                throw new IllegalStateException("#parent() getter was called on the root node " +
+                        "which should not be exposed outside internal usage");
+            }
+            // the artificial root is internal, so a top-level block reports no parent
+            // (as documented by Block#parent()) instead of leaking it
+            return parent.isRoot() ? null : parent;
+        }
+
+        /**
+         * @return unmodifiable list of children, empty if there are none (never null, even
+         * if this block was not closed via {@link #closeAt(int)})
+         */
+        @NonNull
+        @Override
+        public List<Block> children() {
+            if (children == null) {
+                // `end` can be assigned directly, in which case the list is never initialized
+                return Collections.emptyList();
+            }
+            return Collections.<Block>unmodifiableList(children);
+        }
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java
@ -0,0 +1,396 @@
+package ru.noties.markwon.html;
+
+import android.support.annotation.NonNull;
+import android.support.annotation.Nullable;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import ru.noties.markwon.html.HtmlTag.Block;
+import ru.noties.markwon.html.HtmlTag.Inline;
+import ru.noties.markwon.html.HtmlTagImpl.BlockImpl;
+import ru.noties.markwon.html.HtmlTagImpl.InlineImpl;
+import ru.noties.markwon.html.jsoup.parser.CharacterReader;
+import ru.noties.markwon.html.jsoup.parser.ParseErrorList;
+import ru.noties.markwon.html.jsoup.parser.Token;
+import ru.noties.markwon.html.jsoup.parser.Tokeniser;
+
+public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
+
+    /**
+     * @return new instance of {@link MarkwonHtmlParserImpl}
+     */
+    @NonNull
+    public static MarkwonHtmlParserImpl create() {
+        return new MarkwonHtmlParserImpl();
+    }
+
+    // names of the tags that are treated as inline, all other tags are treated as blocks
+    // https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
+    private static final Set<String> INLINE_TAGS;
+
+    // names of the tags that are closed right where they are opened (no end tag is awaited)
+    private static final Set<String> VOID_TAGS;
+
+    // these are the tags that are considered _block_ ones
+    // this parser will ensure that these blocks are started on a new line
+    // other tags that are NOT inline are considered as block tags, but won't have new line
+    // inserted before them
+    // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
+    private static final Set<String> BLOCK_TAGS;
+
+    private static final String TAG_PARAGRAPH = "p";
+    private static final String TAG_LIST_ITEM = "li";
+
+    // text that is appended for an `img` tag that has no (or empty) `alt` attribute
+    // todo: make it configurable
+    private static final String IMG_REPLACEMENT = "\uFFFC";
+
+    static {
+        INLINE_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                "a", "abbr", "acronym",
+                "b", "bdo", "big", "br", "button",
+                "cite", "code",
+                "dfn",
+                "em",
+                "i", "img", "input",
+                "kbd",
+                "label",
+                "map",
+                "object",
+                "q",
+                "samp", "script", "select", "small", "span", "strong", "sub", "sup",
+                "textarea", "time", "tt",
+                "var"
+        )));
+        VOID_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                "area",
+                "base", "br",
+                "col",
+                "embed",
+                "hr",
+                "img", "input",
+                "keygen",
+                "link",
+                "meta",
+                "param",
+                "source",
+                "track",
+                "wbr"
+        )));
+        BLOCK_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+                "address", "article", "aside",
+                "blockquote",
+                "canvas",
+                "dd", "div", "dl", "dt",
+                "fieldset", "figcaption", "figure", "footer", "form",
+                "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
+                "li",
+                "main",
+                "nav", "noscript",
+                "ol", "output",
+                "p", "pre",
+                "section",
+                "table", "tfoot",
+                "ul",
+                "video"
+        )));
+    }
+
+    // inline tags collected since the last flush/reset
+    private final List<InlineImpl> inlineTags = new ArrayList<>(0);
+
+    // the block that new blocks are attached to, initially the artificial root
+    private BlockImpl currentBlock = BlockImpl.root();
+
+
+    /**
+     * Tokenises the HTML fragment and dispatches each start tag, end tag and character token
+     * to the matching {@code process*} method. Tokens of other types are skipped.
+     *
+     * @param output       output that processed content is appended to
+     * @param htmlFragment HTML to process
+     */
+    @Override
+    public <T extends Appendable & CharSequence> void processFragment(
+            @NonNull T output,
+            @NonNull String htmlFragment) {
+
+        // todo: maybe there is a way to reuse tokeniser...
+        final CharacterReader reader = new CharacterReader(htmlFragment);
+        final Tokeniser tokeniser = new Tokeniser(reader, ParseErrorList.noTracking());
+
+        // read until EOF token is received
+        for (Token token = tokeniser.read();
+             Token.TokenType.EOF != token.type;
+             token = tokeniser.read()) {
+
+            switch (token.type) {
+
+                case StartTag: {
+                    final Token.StartTag start = (Token.StartTag) token;
+                    if (isInlineTag(start.normalName)) {
+                        processInlineTagStart(output, start);
+                    } else {
+                        processBlockTagStart(output, start);
+                    }
+                    break;
+                }
+
+                case EndTag: {
+                    final Token.EndTag end = (Token.EndTag) token;
+                    if (isInlineTag(end.normalName)) {
+                        processInlineTagEnd(output, end);
+                    } else {
+                        processBlockTagEnd(output, end);
+                    }
+                    break;
+                }
+
+                case Character: {
+                    processCharacter(output, (Token.Character) token);
+                    break;
+                }
+
+                default:
+                    // all other token types are not processed
+                    break;
+            }
+
+            // do not forget to reset processed token (even if it's not processed)
+            token.reset();
+        }
+    }
+
+    /**
+     * Closes all still-open inline tags at {@code documentLength}, passes every collected inline
+     * tag to the action and then forgets them. The action is not called if there are no tags.
+     *
+     * @param documentLength index at which still-open tags are closed
+     * @param action         receives an unmodifiable list of collected inline tags
+     */
+    @Override
+    public void flushInlineTags(int documentLength, @NonNull FlushAction<Inline> action) {
+
+        if (inlineTags.isEmpty()) {
+            return;
+        }
+
+        for (InlineImpl inline : inlineTags) {
+            inline.closeAt(documentLength);
+        }
+
+        // hand over a snapshot: `inlineTags` is cleared right after the call, so a mere view
+        // of it would turn empty in case the action keeps a reference to the list
+        final List<Inline> flushed = new ArrayList<Inline>(inlineTags);
+        action.apply(Collections.unmodifiableList(flushed));
+
+        inlineTags.clear();
+    }
+
+    /**
+     * Closes the whole tree of blocks at {@code documentLength}, passes the children of the
+     * root to the action (only if there are any) and starts over with a new root.
+     *
+     * @param documentLength index at which still-open blocks are closed
+     * @param action         receives the children of the root block
+     */
+    @Override
+    public void flushBlockTags(int documentLength, @NonNull FlushAction<Block> action) {
+
+        // go up until the root is reached
+        BlockImpl root = currentBlock;
+        while (true) {
+            if (root.isRoot()) {
+                break;
+            }
+            root = root.parent;
+        }
+
+        root.closeAt(documentLength);
+
+        final List<Block> topLevel = root.children();
+        if (!topLevel.isEmpty()) {
+            action.apply(topLevel);
+        }
+
+        currentBlock = BlockImpl.root();
+    }
+
+    /**
+     * Drops all collected inline tags and replaces the tree of blocks with a new empty root.
+     */
+    @Override
+    public void reset() {
+        inlineTags.clear();
+        currentBlock = BlockImpl.root();
+    }
+
+
+    /**
+     * Registers an inline tag that starts at the current length of the output. A void or
+     * self-closing tag is closed right away (after its content, if any, is appended) and is
+     * ignored if it produced no content.
+     */
+    protected <T extends Appendable & CharSequence> void processInlineTagStart(
+            @NonNull T output,
+            @NonNull Token.StartTag startTag) {
+
+        final String tagName = startTag.normalName;
+        final InlineImpl tag = new InlineImpl(tagName, output.length());
+
+        final boolean closedImmediately = isVoidTag(tagName) || startTag.selfClosing;
+        if (closedImmediately) {
+            // check if we have content to append as we must close this tag here
+            processVoidTag(output, startTag);
+            tag.end = output.length();
+        }
+
+        // a void/self-closing tag without content starts and ends at the same index -> ignore it
+        // (an open tag always differs as its end has no value yet)
+        if (tag.start == tag.end) {
+            return;
+        }
+
+        inlineTags.add(tag);
+    }
+
+    /**
+     * Closes the most recent open inline tag with the same name at the current length of the
+     * output. An end tag without a matching open tag is ignored.
+     */
+    protected <T extends Appendable & CharSequence> void processInlineTagEnd(
+            @NonNull T output,
+            @NonNull Token.EndTag endTag) {
+
+        final InlineImpl open = findOpenInlineTag(endTag.normalName);
+        if (open == null) {
+            // nothing to close
+            return;
+        }
+
+        open.end = output.length();
+    }
+
+
+    /**
+     * Opens a new block as a child of the current block and makes it the current one.
+     * Before that an open paragraph is closed (a new line is appended after it), as well as
+     * an open list item if the new block is a list item too. Tags listed in
+     * {@code BLOCK_TAGS} are additionally ensured to start on a new line.
+     */
+    protected <T extends Appendable & CharSequence> void processBlockTagStart(
+            @NonNull T output,
+            @NonNull Token.StartTag startTag) {
+
+        final String name = startTag.normalName;
+
+        // block tags (all that are NOT inline) -> blocks
+        // I think there is only one strong rule -> paragraph cannot contain anything
+        // except inline tags
+        // also, closing paragraph with non-closed inlines -> doesn't close inlines
+        // they are continued for _afterwards_
+
+        // implicitly closed blocks go through `closeAt` (not a bare `end` assignment) so that
+        // their list of children is finalized and `children()` never returns null for them
+        if (TAG_PARAGRAPH.equals(currentBlock.name)) {
+            // it must be closed here no matter what we are as here we _assume_
+            // that it's a block tag
+            append(output, "\n");
+            currentBlock.closeAt(output.length());
+            currentBlock = currentBlock.parent;
+        } else if (TAG_LIST_ITEM.equals(name)
+                && TAG_LIST_ITEM.equals(currentBlock.name)) {
+            // close previous list item if in the same parent
+            currentBlock.closeAt(output.length());
+            currentBlock = currentBlock.parent;
+        }
+
+        if (isBlockTag(name)) {
+            ensureNewLine(output);
+        }
+
+        final int start = output.length();
+
+        final BlockImpl block = BlockImpl.create(name, start, currentBlock);
+
+        //noinspection ConstantConditions
+        appendBlockChild(block.parent, block);
+
+        this.currentBlock = block;
+    }
+
+    /**
+     * Closes the nearest open block with the same name (looking from the current block up)
+     * at the current length of the output and makes its parent the current block. A new line
+     * is appended first when a paragraph is closed. An end tag without a matching open block
+     * is ignored.
+     */
+    protected <T extends Appendable & CharSequence> void processBlockTagEnd(
+            @NonNull T output,
+            @NonNull Token.EndTag endTag) {
+
+        final String tagName = endTag.normalName;
+
+        final BlockImpl open = findOpenBlockTag(tagName);
+        if (open == null) {
+            return;
+        }
+
+        if (TAG_PARAGRAPH.equals(tagName)) {
+            append(output, "\n");
+        }
+
+        open.closeAt(output.length());
+        this.currentBlock = open.parent;
+    }
+
+    /**
+     * Appends the content of a void/self-closing tag: a new line for {@code br}; for
+     * {@code img} its {@code alt} attribute or, if that is missing or empty, the image
+     * replacement text. All other tags append nothing.
+     */
+    protected <T extends Appendable & CharSequence> void processVoidTag(
+            @NonNull T output,
+            @NonNull Token.StartTag startTag) {
+
+        final String tagName = startTag.normalName;
+
+        if ("br".equals(tagName)) {
+            append(output, "\n");
+            return;
+        }
+
+        if (!"img".equals(tagName)) {
+            // other tags are ignored
+            return;
+        }
+
+        final String alt = startTag.attributes.getIgnoreCase("alt");
+        final boolean hasAlt = alt != null && alt.length() != 0;
+        if (hasAlt) {
+            append(output, alt);
+        } else {
+            // no alt is provided
+            append(output, IMG_REPLACEMENT);
+        }
+    }
+
+    /**
+     * Appends the data of a character token to the output.
+     */
+    protected <T extends Appendable & CharSequence> void processCharacter(
+            @NonNull T output,
+            @NonNull Token.Character character) {
+
+        // the thing here is: if we are inside a script tag -> we must not treat this
+        // as the text to append... should we even care about this? how many people are
+        // going to include script tags as inline html?
+        //
+        // so tags are: BUTTON, INPUT, SELECT, SCRIPT, TEXTAREA
+        //
+        // actually we must decide it here: should we append characters for these tags
+        // or not, as later we won't be able to change it and/or allow modification (as
+        // all indexes will be affected by this)
+
+        // for now: ignore the inline context
+        append(output, character.getData());
+    }
+
+    /**
+     * Adds the child to the children of the parent, creating the list on first use.
+     */
+    protected void appendBlockChild(@NonNull BlockImpl parent, @NonNull BlockImpl child) {
+        if (parent.children == null) {
+            parent.children = new ArrayList<>(2);
+        }
+        parent.children.add(child);
+    }
+
+    /**
+     * Looks from the most recently added inline tag backwards.
+     *
+     * @return the first tag with the specified name that is not closed yet, or null
+     */
+    @Nullable
+    protected InlineImpl findOpenInlineTag(@NonNull String name) {
+
+        int index = inlineTags.size();
+
+        while (--index > -1) {
+            final InlineImpl candidate = inlineTags.get(index);
+            if (name.equals(candidate.name)
+                    && candidate.end < 0) {
+                return candidate;
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Looks from the current block up through its parents.
+     *
+     * @return the first block with the specified name, or null if there is none
+     */
+    @Nullable
+    protected BlockImpl findOpenBlockTag(@NonNull String name) {
+
+        for (BlockImpl candidate = currentBlock; candidate != null; candidate = candidate.parent) {
+            if (name.equals(candidate.name)) {
+                return candidate;
+            }
+        }
+
+        return null;
+    }
+
+    // name here must be lower case (the sets contain lower-case names only)
+    protected static boolean isInlineTag(@NonNull String name) {
+        return INLINE_TAGS.contains(name);
+    }
+
+    // name here must be lower case
+    protected static boolean isVoidTag(@NonNull String name) {
+        return VOID_TAGS.contains(name);
+    }
+
+    // name here must be lower case
+    protected static boolean isBlockTag(@NonNull String name) {
+        return BLOCK_TAGS.contains(name);
+    }
+
+    /**
+     * Appends text to the appendable, rethrowing an {@link IOException} as a
+     * {@link RuntimeException} (with the original exception as its cause).
+     */
+    protected static void append(@NonNull Appendable appendable, @NonNull CharSequence text) {
+        try {
+            appendable.append(text);
+        } catch (IOException e) {
+            // _must_ not happen
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Appends a new line unless the output is empty or already ends with one.
+     */
+    protected static <T extends Appendable & CharSequence> void ensureNewLine(@NonNull T output) {
+
+        final int length = output.length();
+        if (length <= 0) {
+            // nothing is appended to an empty output
+            return;
+        }
+
+        if ('\n' == output.charAt(length - 1)) {
+            return;
+        }
+
+        append(output, "\n");
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/UncheckedIOException.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/UncheckedIOException.java
@ -0,0 +1,13 @@
+package ru.noties.markwon.html.jsoup;
+
+import java.io.IOException;
+
+/**
+ * Unchecked wrapper around an {@link IOException}.
+ */
+public class UncheckedIOException extends RuntimeException {
+
+    /**
+     * @param cause the exception to wrap, stored as the cause of this exception
+     */
+    public UncheckedIOException(IOException cause) {
+        super(cause);
+    }
+
+    /**
+     * @return the wrapped exception (the cause of this exception)
+     */
+    public IOException ioException() {
+        return (IOException) getCause();
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Normalizer.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Normalizer.java
@ -0,0 +1,18 @@
+package ru.noties.markwon.html.jsoup.helper;
+
+import java.util.Locale;
+
+/**
+ * Util methods for normalizing strings. Jsoup internal use only, please don't depend on this API.
+ */
+public final class Normalizer {
+
+    /**
+     * @param input string to lower-case, can be null
+     * @return input lower-cased with {@link Locale#ENGLISH} or an empty string if input is null
+     */
+    public static String lowerCase(final String input) {
+        if (input == null) {
+            return "";
+        }
+        return input.toLowerCase(Locale.ENGLISH);
+    }
+
+    /**
+     * @param input string to normalize, can be null
+     * @return input lower-cased and trimmed or an empty string if input is null
+     */
+    public static String normalize(final String input) {
+        final String lowered = lowerCase(input);
+        return lowered.trim();
+    }
+}
+
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Validate.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Validate.java
@ -0,0 +1,112 @@
+package ru.noties.markwon.html.jsoup.helper;
+
+/**
+ * Simple validation methods. Designed for jsoup internal use. Every failed validation is
+ * reported with an {@link IllegalArgumentException}.
+ */
+public final class Validate {
+
+    private Validate() {}
+
+    /**
+     * Validates that the object is not null
+     * @param obj object to test
+     */
+    public static void notNull(Object obj) {
+        notNull(obj, "Object must not be null");
+    }
+
+    /**
+     * Validates that the object is not null
+     * @param obj object to test
+     * @param msg message to output if validation fails
+     */
+    public static void notNull(Object obj, String msg) {
+        if (obj == null) {
+            throw new IllegalArgumentException(msg);
+        }
+    }
+
+    /**
+     * Validates that the value is true
+     * @param val object to test
+     */
+    public static void isTrue(boolean val) {
+        isTrue(val, "Must be true");
+    }
+
+    /**
+     * Validates that the value is true
+     * @param val object to test
+     * @param msg message to output if validation fails
+     */
+    public static void isTrue(boolean val, String msg) {
+        if (!val) {
+            throw new IllegalArgumentException(msg);
+        }
+    }
+
+    /**
+     * Validates that the value is false
+     * @param val object to test
+     */
+    public static void isFalse(boolean val) {
+        isFalse(val, "Must be false");
+    }
+
+    /**
+     * Validates that the value is false
+     * @param val object to test
+     * @param msg message to output if validation fails
+     */
+    public static void isFalse(boolean val, String msg) {
+        if (val) {
+            throw new IllegalArgumentException(msg);
+        }
+    }
+
+    /**
+     * Validates that the array contains no null elements
+     * @param objects the array to test
+     */
+    public static void noNullElements(Object[] objects) {
+        noNullElements(objects, "Array must not contain any null objects");
+    }
+
+    /**
+     * Validates that the array contains no null elements
+     * @param objects the array to test
+     * @param msg message to output if validation fails
+     */
+    public static void noNullElements(Object[] objects, String msg) {
+        for (int i = 0; i < objects.length; i++) {
+            if (objects[i] == null) {
+                throw new IllegalArgumentException(msg);
+            }
+        }
+    }
+
+    /**
+     * Validates that the string is not empty (null is considered empty)
+     * @param string the string to test
+     */
+    public static void notEmpty(String string) {
+        notEmpty(string, "String must not be empty");
+    }
+
+    /**
+     * Validates that the string is not empty (null is considered empty)
+     * @param string the string to test
+     * @param msg message to output if validation fails
+     */
+    public static void notEmpty(String string, String msg) {
+        final boolean empty = string == null || string.length() == 0;
+        if (empty) {
+            throw new IllegalArgumentException(msg);
+        }
+    }
+
+    /**
+     * Cause a failure.
+     * @param msg message to output.
+     */
+    public static void fail(String msg) {
+        throw new IllegalArgumentException(msg);
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attribute.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attribute.java
@ -0,0 +1,202 @@
+package ru.noties.markwon.html.jsoup.nodes;
+
+import java.util.Map;
+
+import ru.noties.markwon.html.jsoup.helper.Validate;
+
+/**
+ A single key + value attribute. (Only used for presentation.)
+ */
+public class Attribute implements Map.Entry<String, String>, Cloneable  {
+//    private static final String[] booleanAttributes = {
+//            "allowfullscreen", "async", "autofocus", "checked", "compact", "declare", "default", "defer", "disabled",
+//            "formnovalidate", "hidden", "inert", "ismap", "itemscope", "multiple", "muted", "nohref", "noresize",
+//            "noshade", "novalidate", "nowrap", "open", "readonly", "required", "reversed", "seamless", "selected",
+//            "sortable", "truespeed", "typemustmatch"
+//    };
+
+    private String key;
+    private String val;
+    Attributes parent; // used to update the holding Attributes when the key / value is changed via this interface
+
+    /**
+     * Create a new attribute from unencoded (raw) key and value. The attribute has no
+     * containing {@link Attributes}.
+     * @param key attribute key; case is preserved.
+     * @param value attribute value
+     */
+    public Attribute(String key, String value) {
+        this(key, value, null);
+    }
+
+    /**
+     * Create a new attribute from unencoded (raw) key and value.
+     * @param key attribute key; case is preserved. The key is trimmed and must be neither null
+     *            nor empty after trimming, otherwise an IllegalArgumentException is thrown
+     * @param val attribute value
+     * @param parent the containing Attributes (this Attribute is not automatically added to said Attributes)
+     */
+    public Attribute(String key, String val, Attributes parent) {
+        Validate.notNull(key);
+        key = key.trim();
+        // validate the _trimmed_ key: a whitespace-only key is empty after trimming
+        Validate.notEmpty(key);
+        this.key = key;
+        this.val = val;
+        this.parent = parent;
+    }
+
+    /**
+     * Get the attribute key.
+     * @return the attribute key (stored trimmed)
+     */
+    public String getKey() {
+        return key;
+    }
+
+    /**
+     * Set the attribute key; case is preserved. If this attribute has a parent that contains
+     * the current key, the key is renamed there as well.
+     * @param key the new key; must not be null and must not be empty after trimming
+     */
+    public void setKey(String key) {
+        Validate.notNull(key);
+        final String trimmed = key.trim();
+        Validate.notEmpty(trimmed); // trimming could potentially make empty, so validate here
+        if (parent != null) {
+            final int index = parent.indexOfKey(this.key);
+            if (index != Attributes.NotFound) {
+                parent.keys[index] = trimmed;
+            }
+        }
+        this.key = trimmed;
+    }
+
+    /**
+     * Get the attribute value.
+     * @return the attribute value (null if it was created or set with null)
+     */
+    public String getValue() {
+        return val;
+    }
+
+    /**
+     * Set the attribute value. If this attribute has a parent that contains its key, the
+     * value is updated there as well.
+     * @param val the new attribute value (stored as is, it is not validated)
+     * @return the previous value: the one held by the parent if there is a parent, otherwise
+     * the one held by this attribute
+     */
+    public String setValue(String val) {
+        // an attribute can have no parent (see the 2-arg constructor), so the parent must
+        // not be queried before the null check
+        String oldVal = this.val;
+        if (parent != null) {
+            oldVal = parent.get(this.key);
+            int i = parent.indexOfKey(this.key);
+            if (i != Attributes.NotFound)
+                parent.vals[i] = val;
+        }
+        this.val = val;
+        return oldVal;
+    }
+
+//    /**
+//     Get the HTML representation of this attribute; e.g. {@code href="index.html"}.
+//     @return HTML
+//     */
+//    public String html() {
+//        StringBuilder accum = new StringBuilder();
+//
+//        try {
+//            html(accum, (new Document("")).outputSettings());
+//        } catch(IOException exception) {
+//            throw new SerializationException(exception);
+//        }
+//        return accum.toString();
+//    }
+//
+//    protected static void html(String key, String val, Appendable accum, Document.OutputSettings out) throws IOException {
+//        accum.append(key);
+//        if (!shouldCollapseAttribute(key, val, out)) {
+//            accum.append("=\"");
+//            Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false);
+//            accum.append('"');
+//        }
+//    }
+//
+//    protected void html(Appendable accum, Document.OutputSettings out) throws IOException {
+//        html(key, val, accum, out);
+//    }
+
+//    /**
+//     Get the string representation of this attribute, implemented as {@link #html()}.
+//     @return string
+//     */
+//    @Override
+//    public String toString() {
+//        return html();
+//    }
+
+//    /**
+//     * Create a new Attribute from an unencoded key and a HTML attribute encoded value.
+//     * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars.
+//     * @param encodedValue HTML attribute encoded value
+//     * @return attribute
+//     */
+//    public static Attribute createFromEncoded(String unencodedKey, String encodedValue) {
+//        String value = Entities.unescape(encodedValue, true);
+//        return new Attribute(unencodedKey, value, null); // parent will get set when Put
+//    }
+
+    /**
+     * @return true if the key of this attribute starts with the data prefix and has at least
+     * one character after it
+     */
+    protected boolean isDataAttribute() {
+        return isDataAttribute(key);
+    }
+
+    /**
+     * @param key attribute key to test, must not be null
+     * @return true if the key starts with the data prefix and has at least one character after it
+     */
+    protected static boolean isDataAttribute(String key) {
+        return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length();
+    }
+
+//    /**
+//     * Collapsible if it's a boolean attribute and value is empty or same as name
+//     *
+//     * @param out output settings
+//     * @return  Returns whether collapsible or not
+//     */
+//    protected final boolean shouldCollapseAttribute(Document.OutputSettings out) {
+//        return shouldCollapseAttribute(key, val, out);
+//    }
+
+//    protected static boolean shouldCollapseAttribute(final String key, final String val, final Document.OutputSettings out) {
+//        return (
+//                out.syntax() == Document.OutputSettings.Syntax.html &&
+//                        (val == null || ("".equals(val) || val.equalsIgnoreCase(key)) && Attribute.isBooleanAttribute(key)));
+//    }
+
+//    /**
+//     * @deprecated
+//     */
+//    protected boolean isBooleanAttribute() {
+//        return Arrays.binarySearch(booleanAttributes, key) >= 0 || val == null;
+//    }
+//
+//    /**
+//     * Checks if this attribute name is defined as a boolean attribute in HTML5
+//     */
+//    protected static boolean isBooleanAttribute(final String key) {
+//        return Arrays.binarySearch(booleanAttributes, key) >= 0;
+//    }
+
+    /**
+     * Two attributes are equal if they are of the same class and have equal keys and equal
+     * values (null equals null only).
+     */
+    @Override
+    public boolean equals(Object o) { // note parent not considered
+        if (o == this) {
+            return true;
+        }
+        if (o == null || o.getClass() != getClass()) {
+            return false;
+        }
+        final Attribute other = (Attribute) o;
+        final boolean sameKey = key == null ? other.key == null : key.equals(other.key);
+        if (!sameKey) {
+            return false;
+        }
+        return val == null ? other.val == null : val.equals(other.val);
+    }
+
+    /**
+     * Combines the hash codes of key and value (0 for null).
+     */
+    @Override
+    public int hashCode() { // note parent not considered
+        final int keyHash = key == null ? 0 : key.hashCode();
+        final int valHash = val == null ? 0 : val.hashCode();
+        return 31 * keyHash + valHash;
+    }
+
+    /**
+     * @return a copy of this attribute created by {@code Object#clone()}
+     */
+    @Override
+    public Attribute clone() {
+        try {
+            return (Attribute) super.clone();
+        } catch (CloneNotSupportedException e) {
+            // this class implements Cloneable
+            throw new RuntimeException(e);
+        }
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attributes.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attributes.java
@ -0,0 +1,444 @@
+package ru.noties.markwon.html.jsoup.nodes;
+
+import java.util.AbstractMap;
+import java.util.AbstractSet;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import ru.noties.markwon.html.jsoup.helper.Validate;
+
+import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase;
+
+/**
+ * The attributes of an Element.
+ * <p>
+ * Attributes are treated as a map: there can be only one value associated with an attribute key/name.
+ * </p>
+ * <p>
+ * Attribute name and value comparisons are  generally <b>case sensitive</b>. By default for HTML, attribute names are
+ * normalized to lower-case on parsing. That means you should use lower-case strings when referring to attributes by
+ * name.
+ * </p>
+ *
+ * @author Jonathan Hedley, jonathan@hedley.net
+ */
+public class Attributes implements Iterable<Attribute>, Cloneable {
+    protected static final String dataPrefix = "data-";
+    private static final int InitialCapacity = 4; // todo - analyze Alexa 1MM sites, determine best setting
+
+    // manages the key/val arrays
+    private static final int GrowthFactor = 2;
+    private static final String[] Empty = {};
+    static final int NotFound = -1;
+    private static final String EmptyString = "";
+
+    private int size = 0; // number of slots used (not capacity, which is keys.length)
+    String[] keys = Empty;
+    String[] vals = Empty;
+
+    // check there's room for more; grows both arrays together so they always have equal length
+    private void checkCapacity(int minNewSize) {
+        Validate.isTrue(minNewSize >= size);
+        int curSize = keys.length;
+        if (curSize >= minNewSize)
+            return;
+
+        int newSize = curSize >= InitialCapacity ? size * GrowthFactor : InitialCapacity;
+        if (minNewSize > newSize)
+            newSize = minNewSize;
+
+        keys = copyOf(keys, newSize);
+        vals = copyOf(vals, newSize);
+    }
+
+    // simple implementation of Arrays.copy, for support of Android API 8.
+    private static String[] copyOf(String[] orig, int size) {
+        final String[] copy = new String[size];
+        System.arraycopy(orig, 0, copy, 0,
+                Math.min(orig.length, size));
+        return copy;
+    }
+
+    // index of the first slot whose key equals (case-sensitive) the given key, or NotFound
+    int indexOfKey(String key) {
+        Validate.notNull(key);
+        for (int i = 0; i < size; i++) {
+            if (key.equals(keys[i]))
+                return i;
+        }
+        return NotFound;
+    }
+
+    // index of the first slot whose key equals the given key ignoring case, or NotFound
+    private int indexOfKeyIgnoreCase(String key) {
+        Validate.notNull(key);
+        for (int i = 0; i < size; i++) {
+            if (key.equalsIgnoreCase(keys[i]))
+                return i;
+        }
+        return NotFound;
+    }
+
+    // we track boolean attributes as null in values - they're just keys. so returns empty for consumers
+    static String checkNotNull(String val) {
+        return val == null ? EmptyString : val;
+    }
+
+    /**
+     Get an attribute value by key.
+     @param key the (case-sensitive) attribute key
+     @return the attribute value if set; or empty string if not set (or a boolean attribute).
+     @see #hasKey(String)
+     */
+    public String get(String key) {
+        int i = indexOfKey(key);
+        return i == NotFound ? EmptyString : checkNotNull(vals[i]);
+    }
+
+    /**
+     * Get an attribute's value by case-insensitive key
+     * @param key the attribute name
+     * @return the first matching attribute value if set; or empty string if not set (or a boolean attribute).
+     */
+    public String getIgnoreCase(String key) {
+        int i = indexOfKeyIgnoreCase(key);
+        return i == NotFound ? EmptyString : checkNotNull(vals[i]);
+    }
+
+    // adds without checking if this key exists
+    private void add(String key, String value) {
+        checkCapacity(size + 1);
+        keys[size] = key;
+        vals[size] = value;
+        size++;
+    }
+
+    /**
+     * Set a new attribute, or replace an existing one by key.
+     * @param key case sensitive attribute key
+     * @param value attribute value
+     * @return these attributes, for chaining
+     */
+    public Attributes put(String key, String value) {
+        int i = indexOfKey(key);
+        if (i != NotFound)
+            vals[i] = value;
+        else
+            add(key, value);
+        return this;
+    }
+
+    // like put(String, String) but matches the key ignoring case; the stored key takes the new casing
+    void putIgnoreCase(String key, String value) {
+        int i = indexOfKeyIgnoreCase(key);
+        if (i != NotFound) {
+            vals[i] = value;
+            if (!keys[i].equals(key)) // case changed, update
+                keys[i] = key;
+        }
+        else
+            add(key, value);
+    }
+
+    /**
+     * Set a new boolean attribute, remove attribute if value is false.
+     * <p>
+     * Note the asymmetry: setting ({@code value == true}) matches an existing key
+     * <b>ignoring case</b> and stores a null value, whereas removal ({@code value == false})
+     * goes through the <b>case sensitive</b> {@link #remove(String)}.
+     * </p>
+     * @param key attribute key
+     * @param value true to set the boolean attribute, false to remove it
+     * @return these attributes, for chaining
+     */
+    public Attributes put(String key, boolean value) {
+        if (value)
+            putIgnoreCase(key, null);
+        else
+            remove(key);
+        return this;
+    }
+
+    /**
+     Set a new attribute, or replace an existing one by key. The attribute's parent is set to these attributes.
+     @param attribute attribute with case sensitive key
+     @return these attributes, for chaining
+     */
+    public Attributes put(Attribute attribute) {
+        Validate.notNull(attribute);
+        put(attribute.getKey(), attribute.getValue());
+        attribute.parent = this;
+        return this;
+    }
+
+    // removes and shifts up
+    private void remove(int index) {
+        Validate.isFalse(index >= size);
+        int shifted = size - index - 1;
+        if (shifted > 0) {
+            System.arraycopy(keys, index + 1, keys, index, shifted);
+            System.arraycopy(vals, index + 1, vals, index, shifted);
+        }
+        size--;
+        keys[size] = null; // release hold
+        vals[size] = null;
+    }
+
+    /**
+     Remove an attribute by key. <b>Case sensitive.</b>
+     @param key attribute key to remove
+     */
+    public void remove(String key) {
+        int i = indexOfKey(key);
+        if (i != NotFound)
+            remove(i);
+    }
+
+    /**
+     Remove an attribute by key. <b>Case insensitive.</b>
+     @param key attribute key to remove
+     */
+    public void removeIgnoreCase(String key) {
+        int i = indexOfKeyIgnoreCase(key);
+        if (i != NotFound)
+            remove(i);
+    }
+
+    /**
+     Tests if these attributes contain an attribute with this key.
+     @param key case-sensitive key to check for
+     @return true if key exists, false otherwise
+     */
+    public boolean hasKey(String key) {
+        return indexOfKey(key) != NotFound;
+    }
+
+    /**
+     Tests if these attributes contain an attribute with this key.
+     @param key key to check for (case-insensitive)
+     @return true if key exists, false otherwise
+     */
+    public boolean hasKeyIgnoreCase(String key) {
+        return indexOfKeyIgnoreCase(key) != NotFound;
+    }
+
+    /**
+     Get the number of attributes in this set.
+     @return size
+     */
+    public int size() {
+        return size;
+    }
+
+    /**
+     Add all the attributes from the incoming set to this set.
+     @param incoming attributes to add to these attributes.
+     */
+    public void addAll(Attributes incoming) {
+        if (incoming.size() == 0)
+            return;
+        checkCapacity(size + incoming.size);
+
+        for (Attribute attr : incoming) {
+            // todo - should this be case insensitive?
+            put(attr);
+        }
+
+    }
+
+    /**
+     Iterate over the attributes in insertion order. Each call to {@code next()} creates a fresh
+     {@link Attribute} whose parent is these attributes.
+     @return an iterator that supports {@code remove()}
+     */
+    @Override
+    public Iterator<Attribute> iterator() {
+        return new Iterator<Attribute>() {
+            int i = 0;
+
+            @Override
+            public boolean hasNext() {
+                return i < size;
+            }
+
+            @Override
+            public Attribute next() {
+                // honour the Iterator contract instead of reading a stale or out-of-range slot
+                if (i >= size)
+                    throw new java.util.NoSuchElementException();
+                final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this);
+                i++;
+                return attr;
+            }
+
+            @Override
+            public void remove() {
+                Attributes.this.remove(--i); // next() advanced, so rewind
+            }
+        };
+    }
+
+    /**
+     Get the attributes as a List, for iteration.
+     @return an unmodifiable List holding a snapshot of the attributes; later changes to these
+     attributes are not reflected in the returned list.
+     */
+    public List<Attribute> asList() {
+        ArrayList<Attribute> list = new ArrayList<>(size);
+        for (int i = 0; i < size; i++) {
+//            Attribute attr = vals[i] == null ?
+//                    new BooleanAttribute(keys[i]) : // deprecated class, but maybe someone still wants it
+//                    new Attribute(keys[i], vals[i], Attributes.this);
+//            list.add(attr);
+            list.add(new Attribute(keys[i], vals[i], Attributes.this));
+        }
+        return Collections.unmodifiableList(list);
+    }
+
+    /**
+     * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys
+     * starting with {@code data-}.
+     * @return map of custom data attributes.
+     */
+    public Map<String, String> dataset() {
+        return new Dataset(this);
+    }
+
+//    /**
+//     Get the HTML representation of these attributes.
+//     @return HTML
+//     @throws SerializationException if the HTML representation of the attributes cannot be constructed.
+//     */
+//    public String html() {
+//        StringBuilder accum = new StringBuilder();
+//        try {
+//            html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used
+//        } catch (IOException e) { // ought never happen
+//            throw new SerializationException(e);
+//        }
+//        return accum.toString();
+//    }
+//
+//    final void html(final Appendable accum, final Document.OutputSettings out) throws IOException {
+//        final int sz = size;
+//        for (int i = 0; i < sz; i++) {
+//            // inlined from Attribute.html()
+//            final String key = keys[i];
+//            final String val = vals[i];
+//            accum.append(' ').append(key);
+//
+//            // collapse checked=null, checked="", checked=checked; write out others
+//            if (!Attribute.shouldCollapseAttribute(key, val, out)) {
+//                accum.append("=\"");
+//                Entities.escape(accum, val == null ? EmptyString : val, out, true, false, false);
+//                accum.append('"');
+//            }
+//        }
+//    }
+//
+//    @Override
+//    public String toString() {
+//        return html();
+//    }
+
+    /**
+     * Checks if these attributes are equal to another set of attributes. Two sets are equal when they
+     * hold the same keys and values in the same order; unused capacity of the backing arrays is ignored.
+     * @param o attributes to compare with
+     * @return if both sets of attributes have the same content
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        Attributes that = (Attributes) o;
+
+        if (size != that.size) return false;
+        // compare only the used slots: array lengths (capacity) may differ between equal instances
+        for (int i = 0; i < size; i++) {
+            if (!sameString(keys[i], that.keys[i])) return false;
+            if (!sameString(vals[i], that.vals[i])) return false;
+        }
+        return true;
+    }
+
+    // null-safe String equality (java.util.Objects is avoided to stay compatible with old Android APIs)
+    private static boolean sameString(String a, String b) {
+        return a == null ? b == null : a.equals(b);
+    }
+
+    /**
+     * Calculates the hashcode of these attributes from the size and the used key/value slots only,
+     * so it stays consistent with {@link #equals(Object)} regardless of spare capacity.
+     * @return calculated hashcode
+     */
+    @Override
+    public int hashCode() {
+        int result = size;
+        for (int i = 0; i < size; i++) {
+            result = 31 * result + (keys[i] != null ? keys[i].hashCode() : 0);
+            result = 31 * result + (vals[i] != null ? vals[i].hashCode() : 0);
+        }
+        return result;
+    }
+
+    /**
+     * Creates an independent copy of these attributes. The copy gets its own key/value arrays
+     * (trimmed to the current size); this instance is left untouched.
+     * @return the cloned attributes
+     */
+    @Override
+    public Attributes clone() {
+        Attributes clone;
+        try {
+            clone = (Attributes) super.clone(); // copies size, shares the arrays for now
+        } catch (CloneNotSupportedException e) {
+            throw new RuntimeException(e);
+        }
+        // give the clone its own arrays; must be assigned to the clone, not to this
+        clone.keys = copyOf(keys, size);
+        clone.vals = copyOf(vals, size);
+        return clone;
+    }
+
+    /**
+     * Internal method. Lowercases all keys.
+     */
+    public void normalize() {
+        for (int i = 0; i < size; i++) {
+            keys[i] = lowerCase(keys[i]);
+        }
+    }
+
+    // live Map facade over the data-* attributes of the wrapped Attributes; map keys omit the data- prefix
+    private static class Dataset extends AbstractMap<String, String> {
+        private final Attributes attributes;
+
+        private Dataset(Attributes attributes) {
+            this.attributes = attributes;
+        }
+
+        @Override
+        public Set<Entry<String, String>> entrySet() {
+            return new EntrySet();
+        }
+
+        @Override
+        public String put(String key, String value) {
+            String dataKey = dataKey(key);
+            String oldValue = attributes.hasKey(dataKey) ? attributes.get(dataKey) : null;
+            attributes.put(dataKey, value);
+            return oldValue;
+        }
+
+        private class EntrySet extends AbstractSet<Map.Entry<String, String>> {
+
+            @Override
+            public Iterator<Map.Entry<String, String>> iterator() {
+                return new DatasetIterator();
+            }
+
+            @Override
+            public int size() {
+                int count = 0;
+                Iterator<Map.Entry<String, String>> iter = new DatasetIterator();
+                while (iter.hasNext()) {
+                    iter.next(); // hasNext() does not consume, so advance explicitly
+                    count++;
+                }
+                return count;
+            }
+        }
+
+        // walks the attribute slots by index, yielding only data attributes
+        private class DatasetIterator implements Iterator<Map.Entry<String, String>> {
+            private int cursor = 0; // slot examined next; when pending != null it is the slot of pending
+            private int lastReturned = NotFound; // slot of the entry last handed out by next(), for remove()
+            private Attribute pending; // data attribute found by hasNext() but not yet returned
+
+            @Override
+            public boolean hasNext() {
+                // idempotent look-ahead: skip non-data attributes until one is found or the slots run out
+                while (pending == null && cursor < attributes.size) {
+                    Attribute candidate = new Attribute(attributes.keys[cursor], attributes.vals[cursor], attributes);
+                    if (candidate.isDataAttribute())
+                        pending = candidate;
+                    else
+                        cursor++;
+                }
+                return pending != null;
+            }
+
+            @Override
+            public Entry<String, String> next() {
+                if (!hasNext())
+                    throw new java.util.NoSuchElementException();
+                final Attribute attr = pending;
+                pending = null;
+                lastReturned = cursor++;
+                return new Attribute(attr.getKey().substring(dataPrefix.length()), attr.getValue());
+            }
+
+            @Override
+            public void remove() {
+                if (lastReturned == NotFound)
+                    throw new IllegalStateException();
+                attributes.remove(lastReturned);
+                cursor--; // everything after the removed slot moved up by one
+                lastReturned = NotFound;
+            }
+        }
+    }
+
+    private static String dataKey(String key) {
+        return dataPrefix + key;
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/DocumentType.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/DocumentType.java
@ -0,0 +1,104 @@
+package ru.noties.markwon.html.jsoup.nodes;
+
+/**
+ * A {@code <!DOCTYPE>} node.
+ */
+// Only the two keyword constants below are live code; the node behaviour (the class used to
+// extend LeafNode) is kept commented out for reference.
+public class DocumentType /*extends LeafNode*/ {
+    // todo needs a bit of a chunky cleanup. this level of detail isn't needed
+    /** The {@code PUBLIC} keyword; the commented-out constructors below stored it as the pubSysKey when a public id was present. */
+    public static final String PUBLIC_KEY = "PUBLIC";
+    /** The {@code SYSTEM} keyword. */
+    public static final String SYSTEM_KEY = "SYSTEM";
+//    private static final String NAME = "name";
+//    private static final String PUB_SYS_KEY = "pubSysKey"; // PUBLIC or SYSTEM
+//    private static final String PUBLIC_ID = "publicId";
+//    private static final String SYSTEM_ID = "systemId";
+    // todo: quirk mode from publicId and systemId
+
+//    /**
+//     * Create a new doctype element.
+//     * @param name the doctype's name
+//     * @param publicId the doctype's public ID
+//     * @param systemId the doctype's system ID
+//     */
+//    public DocumentType(String name, String publicId, String systemId) {
+//        Validate.notNull(name);
+//        Validate.notNull(publicId);
+//        Validate.notNull(systemId);
+//        attr(NAME, name);
+//        attr(PUBLIC_ID, publicId);
+//        if (has(PUBLIC_ID)) {
+//            attr(PUB_SYS_KEY, PUBLIC_KEY);
+//        }
+//        attr(SYSTEM_ID, systemId);
+//    }
+//
+//    /**
+//     * Create a new doctype element.
+//     * @param name the doctype's name
+//     * @param publicId the doctype's public ID
+//     * @param systemId the doctype's system ID
+//     * @param baseUri unused
+//     * @deprecated
+//     */
+//    public DocumentType(String name, String publicId, String systemId, String baseUri) {
+//        attr(NAME, name);
+//        attr(PUBLIC_ID, publicId);
+//        if (has(PUBLIC_ID)) {
+//            attr(PUB_SYS_KEY, PUBLIC_KEY);
+//        }
+//        attr(SYSTEM_ID, systemId);
+//    }
+//
+//    /**
+//     * Create a new doctype element.
+//     * @param name the doctype's name
+//     * @param publicId the doctype's public ID
+//     * @param systemId the doctype's system ID
+//     * @param baseUri unused
+//     * @deprecated
+//     */
+//    public DocumentType(String name, String pubSysKey, String publicId, String systemId, String baseUri) {
+//        attr(NAME, name);
+//        if (pubSysKey != null) {
+//            attr(PUB_SYS_KEY, pubSysKey);
+//        }
+//        attr(PUBLIC_ID, publicId);
+//        attr(SYSTEM_ID, systemId);
+//    }
+//    public void setPubSysKey(String value) {
+//        if (value != null)
+//            attr(PUB_SYS_KEY, value);
+//    }
+//
+//    @Override
+//    public String nodeName() {
+//        return "#doctype";
+//    }
+//
+//    @Override
+//    void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
+//        if (out.syntax() == Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) {
+//            // looks like a html5 doctype, go lowercase for aesthetics
+//            accum.append("<!doctype");
+//        } else {
+//            accum.append("<!DOCTYPE");
+//        }
+//        if (has(NAME))
+//            accum.append(" ").append(attr(NAME));
+//        if (has(PUB_SYS_KEY))
+//            accum.append(" ").append(attr(PUB_SYS_KEY));
+//        if (has(PUBLIC_ID))
+//            accum.append(" \"").append(attr(PUBLIC_ID)).append('"');
+//        if (has(SYSTEM_ID))
+//            accum.append(" \"").append(attr(SYSTEM_ID)).append('"');
+//        accum.append('>');
+//    }
+//
+//    @Override
+//    void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {
+//    }
+//
+//    private boolean has(final String attribute) {
+//        return !StringUtil.isBlank(attr(attribute));
+//    }
+}
+
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Entities.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Entities.java
@ -0,0 +1,351 @@
+package ru.noties.markwon.html.jsoup.nodes;
+
+import java.nio.charset.CharsetEncoder;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import ru.noties.markwon.html.jsoup.helper.Validate;
+import ru.noties.markwon.html.jsoup.parser.CharacterReader;
+
+import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.base;
+import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.extended;
+
+/**
+ * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
+ * HTML named character references</a>.
+ */
+public class Entities {
+    private static final int empty = -1;
+    private static final String emptyName = "";
+    static final int codepointRadix = 36;
+    private static final char[] codeDelims = {',', ';'};
+    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
+//    private static final Document.OutputSettings DefaultOutput = new Document.OutputSettings();
+
+    public enum EscapeMode {
+        /**
+         * The restricted XHTML set: lt, gt, amp, and quot only.
+         */
+        xhtml(EntitiesData.xmlPoints, 4),
+        /**
+         * The default set of HTML output entities.
+         */
+        base(EntitiesData.basePoints, 106),
+        /**
+         * The complete set of HTML entities.
+         */
+        extended(EntitiesData.fullPoints, 2125);
+
+        // name -> codepoint table; names are sorted so they can be binary searched. built by BuildEntities.
+        private String[] nameKeys;
+        private int[] codeVals; // single codepoint per name; names with two codepoints also go into multipoints.
+
+        // codepoint -> name table, searched by codepoint.
+        private int[] codeKeys; // multiple codepoints mapping to a single name are not supported here
+        private String[] nameVals;
+
+        EscapeMode(String file, int size) {
+            load(this, file, size);
+        }
+
+        // codepoint registered for the name, or `empty` when the name is unknown in this mode
+        int codepointForName(final String name) {
+            final int pos = Arrays.binarySearch(nameKeys, name);
+            if (pos < 0) {
+                return empty;
+            }
+            return codeVals[pos];
+        }
+
+        // entity name registered for the codepoint, or `emptyName` when there is none in this mode
+        String nameForCodepoint(final int codepoint) {
+            final int pos = Arrays.binarySearch(codeKeys, codepoint);
+            if (pos < 0) {
+                return emptyName;
+            }
+            // entries for the same codepoint are ordered upper-case name first, and the lower-case one is
+            // preferred; binary search may land on either, so peek at the following entry
+            final int following = pos + 1;
+            final boolean preferFollowing = following < nameVals.length && codeKeys[following] == codepoint;
+            return preferFollowing ? nameVals[following] : nameVals[pos];
+        }
+
+        private int size() {
+            return nameKeys.length;
+        }
+    }
+
+    private Entities() {
+    }
+
+    /**
+     * Tells whether the given name is a known named entity of the extended set.
+     *
+     * @param name candidate entity name (e.g. "lt" or "amp")
+     * @return true if the name is a known named entity
+     */
+    public static boolean isNamedEntity(final String name) {
+        final int codepoint = extended.codepointForName(name);
+        return codepoint != empty;
+    }
+
+    /**
+     * Tells whether the given name is a known named entity of the base set.
+     *
+     * @param name candidate entity name (e.g. "lt" or "amp")
+     * @return true if the name is a known named entity in the base set
+     * @see #isNamedEntity(String)
+     */
+    public static boolean isBaseNamedEntity(final String name) {
+        final int codepoint = base.codepointForName(name);
+        return codepoint != empty;
+    }
+
+    /**
+     * Looks up the Character value of a named entity in the extended set.
+     *
+     * @param name named entity (e.g. "lt" or "amp")
+     * @return the looked-up codepoint narrowed to a char (e.g. '{@literal <}' or '{@literal &}')
+     * @deprecated does not support characters outside the BMP or multiple character names
+     */
+    public static Character getCharacterByName(String name) {
+        final int codepoint = extended.codepointForName(name);
+        return (char) codepoint;
+    }
+
+    /**
+     * Resolves a named entity to the character(s) it stands for.
+     *
+     * @param name entity (e.g. "lt" or "amp")
+     * @return the string made of the character(s) represented by this entity, or "" if not defined
+     */
+    public static String getByName(String name) {
+        final String multi = multipoints.get(name);
+        if (multi != null) {
+            return multi;
+        }
+        final int codepoint = extended.codepointForName(name);
+        if (codepoint == empty) {
+            return emptyName;
+        }
+        return new String(new int[]{codepoint}, 0, 1);
+    }
+
+    /**
+     * Writes the codepoint(s) of a named entity into the supplied array.
+     *
+     * @param name entity name
+     * @param codepoints output array; index 0 (and index 1 for two-codepoint entities) is overwritten
+     * @return how many codepoints were written: 2, 1, or 0 when the name is unknown
+     */
+    public static int codepointsForName(final String name, final int[] codepoints) {
+        final String multi = multipoints.get(name);
+        if (multi != null) {
+            codepoints[0] = multi.codePointAt(0);
+            codepoints[1] = multi.codePointAt(1);
+            return 2;
+        }
+        final int codepoint = extended.codepointForName(name);
+        if (codepoint == empty) {
+            return 0;
+        }
+        codepoints[0] = codepoint;
+        return 1;
+    }
+
+//    /**
+//     * HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
+//     *
+//     * @param string the un-escaped string to escape
+//     * @param out the output settings to use
+//     * @return the escaped string
+//     */
+//    public static String escape(String string, Document.OutputSettings out) {
+//        if (string == null)
+//            return "";
+//        StringBuilder accum = new StringBuilder(string.length() * 2);
+//        try {
+//            escape(accum, string, out, false, false, false);
+//        } catch (IOException e) {
+//            throw new SerializationException(e); // doesn't happen
+//        }
+//        return accum.toString();
+//    }
+
+//    /**
+//     * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
+//     * {@code &lt;}
+//     *
+//     * @param string the un-escaped string to escape
+//     * @return the escaped string
+//     */
+//    public static String escape(String string) {
+//        return escape(string, DefaultOutput);
+//    }
+//
+//    // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
+//    static void escape(Appendable accum, String string, Document.OutputSettings out,
+//                       boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
+//
+//        boolean lastWasWhite = false;
+//        boolean reachedNonWhite = false;
+//        final EscapeMode escapeMode = out.escapeMode();
+//        final CharsetEncoder encoder = out.encoder();
+//        final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
+//        final int length = string.length();
+//
+//        int codePoint;
+//        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
+//            codePoint = string.codePointAt(offset);
+//
+//            if (normaliseWhite) {
+//                if (StringUtil.isWhitespace(codePoint)) {
+//                    if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
+//                        continue;
+//                    accum.append(' ');
+//                    lastWasWhite = true;
+//                    continue;
+//                } else {
+//                    lastWasWhite = false;
+//                    reachedNonWhite = true;
+//                }
+//            }
+//            // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
+//            if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+//                final char c = (char) codePoint;
+//                // html specific and required escapes:
+//                switch (c) {
+//                    case '&':
+//                        accum.append("&amp;");
+//                        break;
+//                    case 0xA0:
+//                        if (escapeMode != EscapeMode.xhtml)
+//                            accum.append("&nbsp;");
+//                        else
+//                            accum.append("&#xa0;");
+//                        break;
+//                    case '<':
+//                        // escape when in character data or when in a xml attribue val; not needed in html attr val
+//                        if (!inAttribute || escapeMode == EscapeMode.xhtml)
+//                            accum.append("&lt;");
+//                        else
+//                            accum.append(c);
+//                        break;
+//                    case '>':
+//                        if (!inAttribute)
+//                            accum.append("&gt;");
+//                        else
+//                            accum.append(c);
+//                        break;
+//                    case '"':
+//                        if (inAttribute)
+//                            accum.append("&quot;");
+//                        else
+//                            accum.append(c);
+//                        break;
+//                    default:
+//                        if (canEncode(coreCharset, c, encoder))
+//                            accum.append(c);
+//                        else
+//                            appendEncoded(accum, escapeMode, codePoint);
+//                }
+//            } else {
+//                final String c = new String(Character.toChars(codePoint));
+//                if (encoder.canEncode(c)) // uses fallback encoder for simplicity
+//                    accum.append(c);
+//                else
+//                    appendEncoded(accum, escapeMode, codePoint);
+//            }
+//        }
+//    }
+
+//    private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
+//        final String name = escapeMode.nameForCodepoint(codePoint);
+//        if (name != emptyName) // ok for identity check
+//            accum.append('&').append(name).append(';');
+//        else
+//            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
+//    }
+
+//    /**
+//     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
+//     *
+//     * @param string the HTML string to un-escape
+//     * @return the unescaped string
+//     */
+//    public static String unescape(String string) {
+//        return unescape(string, false);
+//    }
+
+//    /**
+//     * Unescape the input string.
+//     *
+//     * @param string to un-HTML-escape
+//     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
+//     * @return unescaped string
+//     */
+//    static String unescape(String string, boolean strict) {
+//        return Parser.unescapeEntities(string, strict);
+//    }
+
+    /*
+     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
+     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
+     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
+     * issues on Android if required.
+     *
+     * Benchmarks:     *
+     * OLD toHtml() impl v New (fastpath) in millis
+     * Wiki: 1895, 16
+     * CNN: 6378, 55
+     * Alterslash: 3013, 28
+     * Jsoup: 167, 2
+     */
+    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
+        // todo add more charset tests if impacted by Android's bad perf in canEncode
+        final boolean encodable;
+        switch (charset) {
+            case ascii:
+                encodable = c < 0x80;
+                break;
+            case utf:
+                encodable = true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
+                break;
+            default:
+                encodable = fallback.canEncode(c);
+                break;
+        }
+        return encodable;
+    }
+
+    enum CoreCharset {
+        ascii, utf, fallback;
+
+        // maps a charset name onto one of the fast-path buckets
+        static CoreCharset byName(final String name) {
+            if (name.equals("US-ASCII")) {
+                return ascii;
+            }
+            // the UTF- prefix covers UTF-8, UTF-16, et al
+            return name.startsWith("UTF-") ? utf : fallback;
+        }
+    }
+
+    // fills the four lookup tables of the given mode from its packed points data
+    private static void load(EscapeMode e, String pointsData, int size) {
+        e.nameKeys = new String[size];
+        e.codeVals = new int[size];
+        e.codeKeys = new int[size];
+        e.nameVals = new String[size];
+
+        final CharacterReader reader = new CharacterReader(pointsData);
+
+        int loaded = 0;
+        for (; !reader.isEmpty(); loaded++) {
+            // one record looks like: NotNestedLessLess=10913,824;1887&
+
+            final String name = reader.consumeTo('=');
+            reader.advance();
+            final int first = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
+            final boolean secondFollows = reader.current() == ',';
+            reader.advance();
+            int second = empty;
+            if (secondFollows) {
+                second = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
+                reader.advance();
+            }
+            // position of this entry in the codepoint-ordered tables
+            final int slot = Integer.parseInt(reader.consumeTo('&'), codepointRadix);
+            reader.advance();
+
+            e.nameKeys[loaded] = name;
+            e.codeVals[loaded] = first;
+            e.codeKeys[slot] = first;
+            e.nameVals[slot] = name;
+
+            if (second != empty) {
+                multipoints.put(name, new String(new int[]{first, second}, 0, 2));
+            }
+        }
+
+        Validate.isTrue(loaded == size, "Unexpected count of entities loaded");
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/EntitiesData.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/EntitiesData.java
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/CharacterReader.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/CharacterReader.java
@ -0,0 +1,483 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Locale;
+
+import ru.noties.markwon.html.jsoup.UncheckedIOException;
+import ru.noties.markwon.html.jsoup.helper.Validate;
+
+/**
+ CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
+ */
+public final class CharacterReader {
+    // Sentinel returned by current() and consume() once the buffered input is exhausted.
+    static final char EOF = (char) -1;
+    // Strings longer than this are never stored in the string cache.
+    private static final int maxStringCacheLen = 12;
+    // Upper bound on the size of charBuf; also the read-ahead limit passed to Reader.mark().
+    static final int maxBufferLen = 1024 * 32; // visible for testing
+    // Cap on bufSplitPoint: 75% of maxBufferLen.
+    private static final int readAheadLimit = (int) (maxBufferLen * 0.75);
+
+    private final char[] charBuf; // current chunk of input
+    private final Reader reader; // underlying source; must support mark/reset
+    private int bufLength; // number of chars most recently read into charBuf
+    private int bufSplitPoint; // bufferUp() refills once bufPos reaches this index
+    private int bufPos; // cursor within charBuf
+    private int readerPos; // sum of the cursor positions discarded by earlier refills
+    private int bufMark; // cursor position saved by mark()
+    private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage
+
+    /**
+     * Creates a reader over {@code input} with a buffer of {@code sz} chars, capped at {@code maxBufferLen}.
+     * The first chunk is buffered immediately.
+     *
+     * @param input character source; must be non-null and support {@code mark}/{@code reset}
+     * @param sz    requested buffer size
+     */
+    public CharacterReader(Reader input, int sz) {
+        Validate.notNull(input);
+        Validate.isTrue(input.markSupported());
+        final int capacity = Math.min(sz, maxBufferLen);
+        reader = input;
+        charBuf = new char[capacity];
+        bufferUp();
+    }
+
+    /**
+     * Creates a reader over {@code input} using the maximum buffer size.
+     *
+     * @param input character source; must be non-null and support {@code mark}/{@code reset}
+     */
+    public CharacterReader(Reader input) {
+        this(input, maxBufferLen);
+    }
+
+    /**
+     * Creates a reader over an in-memory string; the buffer is sized to the string's length (capped).
+     *
+     * @param input text to read
+     */
+    public CharacterReader(String input) {
+        this(new StringReader(input), input.length());
+    }
+
+    /**
+     * Refills the buffer from the reader once the cursor has reached {@code bufSplitPoint}; otherwise does nothing.
+     * <p>
+     * The chars before the cursor are skipped in the reader, the new position is marked, a chunk is read into
+     * {@code charBuf} and the reader is reset to that mark. If the read reports end of stream ({@code -1}) the
+     * buffer state is left unchanged.
+     *
+     * @throws UncheckedIOException if the underlying reader fails
+     */
+    private void bufferUp() {
+        if (bufPos < bufSplitPoint)
+            return;
+
+        try {
+            reader.skip(bufPos);
+            reader.mark(maxBufferLen);
+            final int read = reader.read(charBuf);
+            reader.reset();
+            if (read != -1) {
+                bufLength = read;
+                readerPos += bufPos; // keeps pos() absolute across refills
+                bufPos = 0;
+                bufMark = 0; // a previously saved mark does not survive a refill
+                bufSplitPoint = bufLength > readAheadLimit ? readAheadLimit : bufLength;
+            }
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /**
+     * Gets the current cursor position in the content: the chars discarded by earlier buffer refills
+     * plus the cursor offset within the current buffer.
+     * @return current position
+     */
+    public int pos() {
+        return readerPos + bufPos;
+    }
+
+    /**
+     * Tests whether all the content has been read, refilling the buffer first if it is due.
+     * @return true if nothing is left to read.
+     */
+    public boolean isEmpty() {
+        bufferUp();
+        return isEmptyNoBufferUp();
+    }
+
+    // Same check as isEmpty(), but against the current buffer only: no refill is attempted.
+    private boolean isEmptyNoBufferUp() {
+        return bufPos >= bufLength;
+    }
+
+    /**
+     * Peeks at the char under the cursor without moving it.
+     * @return the current char, or {@code EOF} when the input is exhausted
+     */
+    public char current() {
+        bufferUp();
+        if (isEmptyNoBufferUp())
+            return EOF;
+        return charBuf[bufPos];
+    }
+
+    /**
+     * Reads the char under the cursor and moves the cursor one step forward.
+     * The cursor advances even when the input is exhausted.
+     * @return the char read, or {@code EOF} when the input is exhausted
+     */
+    char consume() {
+        final char read = current();
+        bufPos++;
+        return read;
+    }
+
+    // Steps the cursor back by one; no bounds check is performed.
+    void unconsume() {
+        bufPos--;
+    }
+
+    /**
+     * Moves the current position by one. No refill and no bounds check is performed here.
+     */
+    public void advance() {
+        bufPos++;
+    }
+
+    // Remembers the current cursor so rewindToMark() can return to it.
+    // bufferUp() zeroes the saved value whenever it refills the buffer.
+    void mark() {
+        bufMark = bufPos;
+    }
+
+    // Moves the cursor back to the position saved by mark().
+    void rewindToMark() {
+        bufPos = bufMark;
+    }
+
+    /**
+     * Finds the distance from the cursor to the next occurrence of {@code c} in the current buffer.
+     * Surrogate pairs are not handled.
+     * @param c scan target
+     * @return offset between current position and next instance of target. -1 if not found.
+     */
+    int nextIndexOf(char c) {
+        bufferUp();
+        int probe = bufPos;
+        while (probe < bufLength && charBuf[probe] != c) {
+            probe++;
+        }
+        return probe < bufLength ? probe - bufPos : -1;
+    }
+
+    /**
+     * Returns the number of characters between the current position and the next instance of the input sequence.
+     * Only the currently buffered content is searched. The first char of {@code seq} is read unconditionally,
+     * so {@code seq} must not be empty.
+     *
+     * @param seq scan target
+     * @return offset between current position and next instance of target. -1 if not found.
+     */
+    int nextIndexOf(CharSequence seq) {
+        bufferUp();
+        // doesn't handle scanning for surrogates
+        char startChar = seq.charAt(0);
+        for (int offset = bufPos; offset < bufLength; offset++) {
+            // scan to first instance of startchar:
+            if (startChar != charBuf[offset])
+                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
+            // i walks the buffer after the matched first char; last is one past the final char of a full match
+            int i = offset + 1;
+            int last = i + seq.length()-1;
+            // only compare when the whole candidate fits inside the buffered content
+            if (offset < bufLength && last <= bufLength) {
+                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
+                if (i == last) // found full sequence
+                    return offset - bufPos;
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * Reads characters up to, but not including, the given char. When the char is not found in the
+     * buffered content, everything up to the end of the buffer is consumed.
+     * @param c the delimiter
+     * @return the chars read
+     */
+    public String consumeTo(char c) {
+        final int distance = nextIndexOf(c);
+        if (distance == -1)
+            return consumeToEnd();
+
+        final String head = cacheString(charBuf, stringCache, bufPos, distance);
+        bufPos += distance;
+        return head;
+    }
+
+    /**
+     * Reads characters up to, but not including, the given sequence. When the sequence is not found in the
+     * buffered content, everything up to the end of the buffer is consumed.
+     * @param seq the delimiting sequence
+     * @return the chars read
+     */
+    String consumeTo(String seq) {
+        final int distance = nextIndexOf(seq);
+        if (distance == -1)
+            return consumeToEnd();
+
+        final String head = cacheString(charBuf, stringCache, bufPos, distance);
+        bufPos += distance;
+        return head;
+    }
+
+    /**
+     * Reads characters until one of the given delimiters, or the end of the buffer, is reached.
+     * The delimiter itself is not consumed.
+     * @param chars delimiters to scan for
+     * @return characters read up to the matched delimiter.
+     */
+    public String consumeToAny(final char... chars) {
+        bufferUp();
+        final int start = bufPos;
+
+        scan:
+        for (; bufPos < bufLength; bufPos++) {
+            final char here = charBuf[bufPos];
+            for (int i = 0; i < chars.length; i++) {
+                if (chars[i] == here)
+                    break scan;
+            }
+        }
+
+        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
+    }
+
+    /**
+     * Reads characters until one of the given delimiters, or the end of the buffer, is reached.
+     * Delimiters are located with a binary search, so {@code chars} has to be sorted.
+     * @param chars sorted delimiters to scan for
+     * @return characters read up to the matched delimiter.
+     */
+    String consumeToAnySorted(final char... chars) {
+        bufferUp();
+        final int start = bufPos;
+
+        for (; bufPos < bufLength; bufPos++) {
+            if (Arrays.binarySearch(chars, charBuf[bufPos]) >= 0)
+                break;
+        }
+
+        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
+    }
+
+    /**
+     * Reads characters until {@code '&'}, {@code '<'}, the null char, or the end of the buffer is reached.
+     * @return the characters read, possibly empty
+     */
+    String consumeData() {
+        bufferUp();
+        final int start = bufPos;
+
+        for (; bufPos < bufLength; bufPos++) {
+            final char c = charBuf[bufPos];
+            if (c == '&' || c == '<' || c == TokeniserState.nullChar)
+                break;
+        }
+
+        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
+    }
+
+    /**
+     * Reads characters until a tag-name terminator is reached: tab, line feed, carriage return, form feed,
+     * space, {@code '/'}, {@code '>'}, the null char, or the end of the buffer.
+     * @return the characters read, possibly empty
+     */
+    String consumeTagName() {
+        bufferUp();
+        final int start = bufPos;
+
+        for (; bufPos < bufLength; bufPos++) {
+            final char c = charBuf[bufPos];
+            final boolean whitespace = c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == ' ';
+            if (whitespace || c == '/' || c == '>' || c == TokeniserState.nullChar)
+                break;
+        }
+
+        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
+    }
+
+    /**
+     * Reads everything that is left in the current buffer and moves the cursor to its end.
+     * @return the remaining buffered characters, possibly empty
+     */
+    String consumeToEnd() {
+        bufferUp();
+        final int remaining = bufLength - bufPos;
+        final String rest = cacheString(charBuf, stringCache, bufPos, remaining);
+        bufPos = bufLength;
+        return rest;
+    }
+
+    /**
+     * Reads a run of letters (ASCII letters or anything {@link Character#isLetter(char)} accepts).
+     * @return the letters read, possibly empty
+     */
+    String consumeLetterSequence() {
+        bufferUp();
+        final int start = bufPos;
+        for (; bufPos < bufLength; bufPos++) {
+            final char c = charBuf[bufPos];
+            final boolean letter = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
+            if (!letter)
+                break;
+        }
+
+        return cacheString(charBuf, stringCache, start, bufPos - start);
+    }
+
+    /**
+     * Reads a run of letters immediately followed by a run of ASCII digits; either run may be empty.
+     * @return the characters read, possibly empty
+     */
+    String consumeLetterThenDigitSequence() {
+        bufferUp();
+        final int start = bufPos;
+
+        // letters first
+        for (; bufPos < bufLength; bufPos++) {
+            final char c = charBuf[bufPos];
+            final boolean letter = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
+            if (!letter)
+                break;
+        }
+        // then digits
+        for (; !isEmptyNoBufferUp(); bufPos++) {
+            final char c = charBuf[bufPos];
+            if (c < '0' || c > '9')
+                break;
+        }
+
+        return cacheString(charBuf, stringCache, start, bufPos - start);
+    }
+
+    /**
+     * Reads a run of hexadecimal digits ({@code 0-9}, {@code A-F}, {@code a-f}).
+     * @return the digits read, possibly empty
+     */
+    String consumeHexSequence() {
+        bufferUp();
+        final int start = bufPos;
+        for (; bufPos < bufLength; bufPos++) {
+            final char c = charBuf[bufPos];
+            final boolean hex = (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
+            if (!hex)
+                break;
+        }
+        return cacheString(charBuf, stringCache, start, bufPos - start);
+    }
+
+    /**
+     * Reads a run of ASCII decimal digits.
+     * @return the digits read, possibly empty
+     */
+    String consumeDigitSequence() {
+        bufferUp();
+        final int start = bufPos;
+        for (; bufPos < bufLength; bufPos++) {
+            final char c = charBuf[bufPos];
+            if (c < '0' || c > '9')
+                break;
+        }
+        return cacheString(charBuf, stringCache, start, bufPos - start);
+    }
+
+    /**
+     * Tests whether the char under the cursor equals {@code c}.
+     * @return false when the input is exhausted
+     */
+    boolean matches(char c) {
+        if (isEmpty())
+            return false;
+        return charBuf[bufPos] == c;
+    }
+
+    /**
+     * Tests whether the buffered content at the cursor starts with {@code seq} (case sensitive).
+     * @return false when fewer chars than {@code seq.length()} remain in the buffer
+     */
+    boolean matches(String seq) {
+        bufferUp();
+        final int scanLength = seq.length();
+        if (scanLength > bufLength - bufPos)
+            return false;
+
+        int matched = 0;
+        while (matched < scanLength && seq.charAt(matched) == charBuf[bufPos + matched]) {
+            matched++;
+        }
+        return matched == scanLength;
+    }
+
+    /**
+     * Tests whether the buffered content at the cursor starts with {@code seq}, comparing chars after
+     * upper-casing both sides.
+     * @return false when fewer chars than {@code seq.length()} remain in the buffer
+     */
+    boolean matchesIgnoreCase(String seq) {
+        bufferUp();
+        final int scanLength = seq.length();
+        if (scanLength > bufLength - bufPos)
+            return false;
+
+        int matched = 0;
+        while (matched < scanLength
+                && Character.toUpperCase(seq.charAt(matched)) == Character.toUpperCase(charBuf[bufPos + matched])) {
+            matched++;
+        }
+        return matched == scanLength;
+    }
+
+    /**
+     * Tests whether the char under the cursor is one of the given chars.
+     * @return false when the input is exhausted or no candidate matches
+     */
+    boolean matchesAny(char... seq) {
+        if (isEmpty())
+            return false;
+
+        bufferUp();
+        final char head = charBuf[bufPos];
+        for (int i = 0; i < seq.length; i++) {
+            if (seq[i] == head)
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Tests whether the char under the cursor is one of the given chars, using a binary search;
+     * {@code seq} has to be sorted.
+     * @return false when the input is exhausted or no candidate matches
+     */
+    boolean matchesAnySorted(char[] seq) {
+        bufferUp();
+        if (isEmpty())
+            return false;
+        return Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
+    }
+
+    /**
+     * Tests whether the char under the cursor is a letter (ASCII or per {@link Character#isLetter(char)}).
+     * @return false when the input is exhausted
+     */
+    boolean matchesLetter() {
+        if (isEmpty())
+            return false;
+        final char c = charBuf[bufPos];
+        final boolean asciiLetter = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+        return asciiLetter || Character.isLetter(c);
+    }
+
+    /**
+     * Tests whether the char under the cursor is an ASCII decimal digit.
+     * @return false when the input is exhausted
+     */
+    boolean matchesDigit() {
+        if (isEmpty())
+            return false;
+        final char c = charBuf[bufPos];
+        return !(c < '0' || c > '9');
+    }
+
+    /**
+     * Consumes {@code seq} when the buffered content at the cursor starts with it (case sensitive).
+     * @return true if the sequence matched and the cursor was moved past it
+     */
+    boolean matchConsume(String seq) {
+        bufferUp();
+        if (!matches(seq))
+            return false;
+
+        bufPos += seq.length();
+        return true;
+    }
+
+    /**
+     * Consumes {@code seq} when the buffered content at the cursor starts with it, ignoring case.
+     * @return true if the sequence matched and the cursor was moved past it
+     */
+    boolean matchConsumeIgnoreCase(String seq) {
+        if (!matchesIgnoreCase(seq))
+            return false;
+
+        bufPos += seq.length();
+        return true;
+    }
+
+    /**
+     * Checks whether the buffered content ahead of the cursor contains {@code seq} written either entirely
+     * in lower case or entirely in upper case (English locale). Mixed-case occurrences are not found.
+     * Used to check presence of </title>, </style>.
+     */
+    boolean containsIgnoreCase(String seq) {
+        final String lower = seq.toLowerCase(Locale.ENGLISH);
+        if (nextIndexOf(lower) > -1)
+            return true;
+
+        final String upper = seq.toUpperCase(Locale.ENGLISH);
+        return nextIndexOf(upper) > -1;
+    }
+
+    /**
+     * Returns the unread remainder of the current buffer.
+     * <p>
+     * {@link #advance()} and {@link #consume()} may move the cursor past the end of the buffered content;
+     * in that case an empty string is returned instead of failing with an index-out-of-bounds error.
+     *
+     * @return buffered characters from the cursor onwards, or {@code ""} when none remain
+     */
+    @Override
+    public String toString() {
+        final int remaining = bufLength - bufPos;
+        if (remaining <= 0)
+            return "";
+        return new String(charBuf, bufPos, remaining);
+    }
+
+    /**
+     * Returns a string for the given buffer range, reusing a previously created instance when possible
+     * to reduce GC load. The cache is per reader, so it cannot leak across documents.
+     * <p>
+     * The cache is a plain array indexed by hash: a slot holds one string, and a colliding string simply
+     * replaces it. That avoids entry objects and collision chains at the expense of some duplicates.
+     * Ranges longer than {@code maxStringCacheLen} bypass the cache; empty ranges yield {@code ""}.
+     */
+    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
+        // too long to be worth caching
+        if (count > maxStringCacheLen)
+            return new String(charBuf, start, count);
+        if (count < 1)
+            return "";
+
+        // 31-based polynomial hash over the range
+        int hash = 0;
+        for (int i = start, end = start + count; i < end; i++) {
+            hash = 31 * hash + charBuf[i];
+        }
+
+        final int slot = hash & (stringCache.length - 1);
+        final String existing = stringCache[slot];
+        if (existing != null && rangeEquals(charBuf, start, count, existing))
+            return existing; // hit
+
+        // empty slot or a different string in it: store the new one, as recently used strings are
+        // more likely to show up again
+        final String created = new String(charBuf, start, count);
+        stringCache[slot] = created;
+        return created;
+    }
+
+    /**
+     * Checks whether the {@code count} chars of {@code charBuf} starting at {@code start} spell exactly
+     * the string {@code cached}.
+     */
+    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
+        if (count != cached.length())
+            return false;
+
+        for (int offset = 0; offset < count; offset++) {
+            if (charBuf[start + offset] != cached.charAt(offset))
+                return false;
+        }
+        return true;
+    }
+
+    // just used for testing: compares a range of this reader's own buffer against the given string
+    boolean rangeEquals(final int start, final int count, final String cached) {
+        return rangeEquals(charBuf, start, count, cached);
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseError.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseError.java
@ -0,0 +1,41 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+/**
+ * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase.
+ * Instances are immutable: position and message are fixed at construction.
+ */
+public class ParseError {
+    private final int pos;
+    private final String errorMsg;
+
+    /**
+     * @param pos      offset of the error within the input
+     * @param errorMsg message, stored as given (not treated as a format string)
+     */
+    ParseError(int pos, String errorMsg) {
+        this.pos = pos;
+        this.errorMsg = errorMsg;
+    }
+
+    /**
+     * @param pos         offset of the error within the input
+     * @param errorFormat {@link String#format(String, Object...)} pattern for the message
+     * @param args        arguments for the pattern
+     */
+    ParseError(int pos, String errorFormat, Object... args) {
+        this(pos, String.format(errorFormat, args));
+    }
+
+    /**
+     * Retrieve the error message.
+     * @return the error message.
+     */
+    public String getErrorMessage() {
+        return errorMsg;
+    }
+
+    /**
+     * Retrieves the offset of the error.
+     * @return error offset within input
+     */
+    public int getPosition() {
+        return pos;
+    }
+
+    /**
+     * @return the error rendered as {@code "<position>: <message>"}
+     */
+    @Override
+    public String toString() {
+        return pos + ": " + errorMsg;
+    }
+}
+
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseErrorList.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseErrorList.java
@ -0,0 +1,34 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+import java.util.ArrayList;
+
+/**
+ * A container for ParseErrors that carries a maximum size. The limit is advisory: it is reported through
+ * {@link #canAddError()} and is not enforced by the list's own {@code add} methods.
+ *
+ * @author Jonathan Hedley
+ */
+public class ParseErrorList extends ArrayList<ParseError> {
+    private static final int INITIAL_CAPACITY = 16;
+    private final int maxSize;
+
+    ParseErrorList(int initialCapacity, int maxSize) {
+        super(initialCapacity);
+        this.maxSize = maxSize;
+    }
+
+    /**
+     * @return a list with a maximum size of zero, so {@link #canAddError()} is never true
+     */
+    public static ParseErrorList noTracking() {
+        return new ParseErrorList(0, 0);
+    }
+
+    /**
+     * @param maxSize number of errors after which {@link #canAddError()} turns false
+     * @return an empty list with the given maximum size
+     */
+    public static ParseErrorList tracking(int maxSize) {
+        return new ParseErrorList(INITIAL_CAPACITY, maxSize);
+    }
+
+    /**
+     * @return true while fewer than the maximum number of errors are stored
+     */
+    boolean canAddError() {
+        return maxSize > size();
+    }
+
+    int getMaxSize() {
+        return maxSize;
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Token.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Token.java
@ -0,0 +1,398 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+import android.support.annotation.NonNull;
+
+import ru.noties.markwon.html.jsoup.helper.Validate;
+import ru.noties.markwon.html.jsoup.nodes.Attributes;
+
+import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase;
+
+/**
+ * Parse tokens for the Tokeniser.
+ */
+public abstract class Token {
+
+    // Kind of this token; fixed at construction by each concrete token class.
+    public final TokenType type;
+
+    /**
+     * @param tokenType kind reported by {@link #type}
+     */
+    protected Token(@NonNull TokenType tokenType) {
+        this.type = tokenType;
+    }
+
+//    String tokenType() {
+//        return this.getClass().getSimpleName();
+//    }
+
+    /**
+     * Resets the data represented by this token, for reuse. Prevents the need to create transfer objects for every
+     * piece of data, which immediately get GCed.
+     *
+     * @return this token
+     */
+    public abstract Token reset();
+
+    /**
+     * Empties the given builder so it can be reused; a {@code null} builder is ignored.
+     */
+    static void reset(StringBuilder sb) {
+        if (sb == null) {
+            return;
+        }
+        sb.setLength(0);
+    }
+
+    /**
+     * Doctype token: name, optional PUBLIC/SYSTEM key, both identifiers and the force-quirks flag.
+     */
+    public static final class Doctype extends Token {
+        final StringBuilder name = new StringBuilder();
+        final StringBuilder publicIdentifier = new StringBuilder();
+        final StringBuilder systemIdentifier = new StringBuilder();
+        String pubSysKey;
+        boolean forceQuirks;
+
+        Doctype() {
+            super(TokenType.Doctype);
+        }
+
+        /**
+         * Clears all three builders, drops the key and switches force-quirks off.
+         */
+        @Override
+        public Token reset() {
+            reset(name);
+            reset(publicIdentifier);
+            reset(systemIdentifier);
+            pubSysKey = null;
+            forceQuirks = false;
+            return this;
+        }
+
+        String getName() {
+            return name.toString();
+        }
+
+        String getPubSysKey() {
+            return pubSysKey;
+        }
+
+        String getPublicIdentifier() {
+            return publicIdentifier.toString();
+        }
+
+        public String getSystemIdentifier() {
+            return systemIdentifier.toString();
+        }
+
+        public boolean isForceQuirks() {
+            return forceQuirks;
+        }
+    }
+
+    /**
+     * Common state of start and end tags: the tag name (original and lower-cased), the self-closing flag,
+     * the collected attributes, and the attribute currently being assembled (its name plus a value that is
+     * kept either as a single string or, once appended to more than once, in a builder).
+     */
+    public static abstract class Tag extends Token {
+
+        public String tagName;
+        public String normalName; // lc version of tag name, for case insensitive tree build
+        private String pendingAttributeName; // attribute names are generally caught in one hop, not accumulated
+        private StringBuilder pendingAttributeValue = new StringBuilder(); // but values are accumulated, from e.g. & in hrefs
+        private String pendingAttributeValueS; // try to get attr vals in one shot, vs Builder
+        private boolean hasEmptyAttributeValue = false; // distinguish boolean attribute from empty string value
+        private boolean hasPendingAttributeValue = false;
+        public boolean selfClosing = false;
+        public Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used).
+
+        protected Tag(@NonNull TokenType tokenType) {
+            super(tokenType);
+        }
+
+        /**
+         * Clears names, the pending attribute state and the self-closing flag, and drops the attributes.
+         *
+         * @return this tag
+         */
+        @Override
+        public Tag reset() {
+            tagName = null;
+            normalName = null;
+            pendingAttributeName = null;
+            reset(pendingAttributeValue);
+            pendingAttributeValueS = null;
+            hasEmptyAttributeValue = false;
+            hasPendingAttributeValue = false;
+            selfClosing = false;
+            attributes = null;
+            return this;
+        }
+
+        /**
+         * Commits the pending attribute (if it has a non-blank name) to {@link #attributes} and clears the
+         * pending state. The stored value is the accumulated value, {@code ""} for an explicitly empty value,
+         * or {@code null} when no value was seen at all.
+         */
+        final void newAttribute() {
+            if (attributes == null)
+                attributes = new Attributes();
+
+            if (pendingAttributeName != null) {
+                // the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here
+                pendingAttributeName = pendingAttributeName.trim();
+                if (pendingAttributeName.length() > 0) {
+                    String value;
+                    if (hasPendingAttributeValue)
+                        // the builder wins when it holds anything; otherwise the value is still in the one-shot string
+                        value = pendingAttributeValue.length() > 0 ? pendingAttributeValue.toString() : pendingAttributeValueS;
+                    else if (hasEmptyAttributeValue)
+                        value = "";
+                    else
+                        value = null;
+                    attributes.put(pendingAttributeName, value);
+                }
+            }
+            pendingAttributeName = null;
+            hasEmptyAttributeValue = false;
+            hasPendingAttributeValue = false;
+            reset(pendingAttributeValue);
+            pendingAttributeValueS = null;
+        }
+
+        /**
+         * Flushes a still-pending attribute so the tag is complete before it is emitted.
+         */
+        final void finaliseTag() {
+            // finalises for emit
+            if (pendingAttributeName != null) {
+                // todo: check if attribute name exists; if so, drop and error
+                newAttribute();
+            }
+        }
+
+        final String name() { // preserves case, for input into Tag.valueOf (which may drop case)
+            Validate.isFalse(tagName == null || tagName.length() == 0);
+            return tagName;
+        }
+
+        final String normalName() { // loses case, used in tree building for working out where in tree it should go
+            return normalName;
+        }
+
+        /**
+         * Sets the tag name and derives its lower-cased form.
+         *
+         * @return this tag
+         */
+        final Tag name(String name) {
+            tagName = name;
+            normalName = lowerCase(name);
+            return this;
+        }
+
+        final boolean isSelfClosing() {
+            return selfClosing;
+        }
+
+        @SuppressWarnings({"TypeMayBeWeakened"})
+        final Attributes getAttributes() {
+            return attributes;
+        }
+
+        // these appenders are rarely hit in not null state-- caused by null chars.
+        final void appendTagName(String append) {
+            tagName = tagName == null ? append : tagName.concat(append);
+            normalName = lowerCase(tagName);
+        }
+
+        final void appendTagName(char append) {
+            appendTagName(String.valueOf(append));
+        }
+
+        final void appendAttributeName(String append) {
+            pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append);
+        }
+
+        final void appendAttributeName(char append) {
+            appendAttributeName(String.valueOf(append));
+        }
+
+        /**
+         * Appends to the pending attribute value. While the builder is empty the value is held as a single
+         * string; a later append moves that string into the builder first (see ensureAttributeValue).
+         */
+        final void appendAttributeValue(String append) {
+            ensureAttributeValue();
+            if (pendingAttributeValue.length() == 0) {
+                pendingAttributeValueS = append;
+            } else {
+                pendingAttributeValue.append(append);
+            }
+        }
+
+        final void appendAttributeValue(char append) {
+            ensureAttributeValue();
+            pendingAttributeValue.append(append);
+        }
+
+        final void appendAttributeValue(char[] append) {
+            ensureAttributeValue();
+            pendingAttributeValue.append(append);
+        }
+
+        final void appendAttributeValue(int[] appendCodepoints) {
+            ensureAttributeValue();
+            for (int codepoint : appendCodepoints) {
+                pendingAttributeValue.appendCodePoint(codepoint);
+            }
+        }
+
+        // Marks the pending attribute as having an explicit, empty value.
+        final void setEmptyAttributeValue() {
+            hasEmptyAttributeValue = true;
+        }
+
+        // Flags that a value is pending and migrates a one-shot string value into the builder.
+        private void ensureAttributeValue() {
+            hasPendingAttributeValue = true;
+            // if on second hit, we'll need to move to the builder
+            if (pendingAttributeValueS != null) {
+                pendingAttributeValue.append(pendingAttributeValueS);
+                pendingAttributeValueS = null;
+            }
+        }
+    }
+
+    /**
+     * Start tag token. Unlike the base tag it always carries an {@link Attributes} instance, both after
+     * construction and after {@link #reset()}.
+     */
+    public final static class StartTag extends Tag {
+        StartTag() {
+            super(TokenType.StartTag);
+            attributes = new Attributes();
+        }
+
+        @Override
+        public Tag reset() {
+            super.reset();
+            // todo - would prefer these to be null, but need to check Element assertions
+            attributes = new Attributes();
+            return this;
+        }
+
+        /**
+         * Sets the name (and its lower-cased form) together with a ready-made attribute set.
+         *
+         * @return this tag
+         */
+        StartTag nameAttr(String name, Attributes attributes) {
+            this.tagName = name;
+            this.attributes = attributes;
+            normalName = lowerCase(tagName);
+            return this;
+        }
+
+        /**
+         * @return {@code <name attrs>} when attributes are present, {@code <name>} otherwise
+         */
+        @Override
+        public String toString() {
+            final boolean hasAttributes = attributes != null && attributes.size() > 0;
+            return hasAttributes
+                    ? "<" + name() + " " + attributes.toString() + ">"
+                    : "<" + name() + ">";
+        }
+    }
+
+    /**
+     * End tag token; rendered by {@link #toString()} as {@code </name>}.
+     */
+    public final static class EndTag extends Tag{
+        EndTag() {
+            super(TokenType.EndTag);
+        }
+
+        @Override
+        public String toString() {
+            return "</" + name() + ">";
+        }
+    }
+
+    /**
+     * Comment token: the accumulated comment text plus a flag marking it as bogus.
+     */
+    public final static class Comment extends Token {
+        final StringBuilder data = new StringBuilder();
+        boolean bogus;
+
+        Comment() {
+            super(TokenType.Comment);
+        }
+
+        /**
+         * Clears the comment text and the bogus flag.
+         */
+        @Override
+        public Token reset() {
+            bogus = false;
+            reset(data);
+            return this;
+        }
+
+        String getData() {
+            return data.toString();
+        }
+
+        /**
+         * @return the comment text wrapped in comment delimiters
+         */
+        @Override
+        public String toString() {
+            return "<!--" + getData() + "-->";
+        }
+    }
+
+    /**
+     * Character-data token holding a single string. Note that inside {@code Token} this name shadows
+     * {@code java.lang.Character}.
+     */
+    public static class Character extends Token {
+        private String data;
+
+        Character() {
+            super(TokenType.Character);
+        }
+
+        // Drops the held data; getData() and toString() return null until data(String) is called again.
+        @Override
+        public Token reset() {
+            data = null;
+            return this;
+        }
+
+        // Fluent setter: stores the data and returns this token.
+        Character data(String data) {
+            this.data = data;
+            return this;
+        }
+
+        public String getData() {
+            return data;
+        }
+
+        @Override
+        public String toString() {
+            return getData();
+        }
+    }
+
+    /**
+     * Character token created with its data up front; it reports the {@code Character} token type and
+     * differs only in how {@link #toString()} renders the data.
+     */
+    public final static class CData extends Character {
+        CData(String data) {
+            super();
+            this.data(data);
+        }
+
+        // Renders the data wrapped in CDATA section delimiters.
+        @Override
+        public String toString() {
+            return "<![CDATA[" + getData() + "]]>";
+        }
+
+    }
+
+    /**
+     * End-of-input token; it carries no data, so {@link #reset()} has nothing to clear.
+     */
+    public final static class EOF extends Token {
+        EOF() {
+            super(Token.TokenType.EOF);
+        }
+
+        @Override
+        public Token reset() {
+            return this;
+        }
+    }
+
+//    final boolean isDoctype() {
+//        return type == TokenType.Doctype;
+//    }
+//
+//    final Doctype asDoctype() {
+//        return (Doctype) this;
+//    }
+//
+//    final boolean isStartTag() {
+//        return type == TokenType.StartTag;
+//    }
+//
+//    final StartTag asStartTag() {
+//        return (StartTag) this;
+//    }
+//
+//    final boolean isEndTag() {
+//        return type == TokenType.EndTag;
+//    }
+//
+//    final EndTag asEndTag() {
+//        return (EndTag) this;
+//    }
+//
+//    final boolean isComment() {
+//        return type == TokenType.Comment;
+//    }
+//
+//    final Comment asComment() {
+//        return (Comment) this;
+//    }
+//
+//    final boolean isCharacter() {
+//        return type == TokenType.Character;
+//    }
+//
+//    final boolean isCData() {
+//        return this instanceof CData;
+//    }
+//
+//    final Character asCharacter() {
+//        return (Character) this;
+//    }
+//
+//    final boolean isEOF() {
+//        return type == TokenType.EOF;
+//    }
+
+    /**
+     * Kinds of token, exposed through {@link Token#type}. There is one constant per concrete token class
+     * except {@code CData}, which reports {@code Character}.
+     */
+    public enum TokenType {
+        Doctype,
+        StartTag,
+        EndTag,
+        Comment,
+        Character, // note no CData - treated in builder as an extension of Character
+        EOF
+    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Tokeniser.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Tokeniser.java
@ -0,0 +1,295 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+import java.util.Arrays;
+
+import ru.noties.markwon.html.jsoup.helper.Validate;
+import ru.noties.markwon.html.jsoup.nodes.Entities;
+
+/**
+ * Reads the input stream into tokens.
+ */
+public final class Tokeniser {
+    static final char replacementChar = '\uFFFD'; // replaces null character
+    // Characters that, when found at the current position, mean there is no character reference to
+    // consume. Sorted once in the static initialiser below because it is passed to matchesAnySorted.
+    private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};
+
+    // Some illegal character escapes are parsed by browsers as windows-1252 instead. See issue #1034
+    // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
+    // First numeric reference value remapped through win1252Extensions (table index 0).
+    static final int win1252ExtensionsStart = 0x80;
+    // Replacement code points, indexed by (value - win1252ExtensionsStart); 32 entries, i.e. 0x80..0x9F.
+    static final int[] win1252Extensions = new int[] {
+            // we could build this manually, but Windows-1252 is not a standard java charset so that could break on
+            // some platforms - this table is verified with a test
+            0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+            0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
+            0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+            0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
+    };
+
+    static {
+        // establish the ordering promised by the field name before any lookup uses it
+        Arrays.sort(notCharRefCharsSorted);
+    }
+
+    private final CharacterReader reader; // html input
+    private final ParseErrorList errors; // errors found while tokenising
+
+    private TokeniserState state = TokeniserState.Data; // current tokenisation state
+    private Token emitPending; // the token we are about to emit on next read
+    private boolean isEmitPending = false; // true while emitPending holds a token not yet returned by read()
+    private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one
+    private StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
+    StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>
+
+    // The tokens below are allocated once and reused: the create*Pending methods reset them
+    // rather than allocating a new instance, and read() hands charPending back for character data.
+    Token.Tag tagPending; // tag we are building up
+    Token.StartTag startPending = new Token.StartTag();
+    Token.EndTag endPending = new Token.EndTag();
+    Token.Character charPending = new Token.Character();
+    Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
+    Token.Comment commentPending = new Token.Comment(); // comment building up
+    private String lastStartTag; // the last start tag emitted, to test appropriate end tag
+
+    /**
+     * Creates a tokeniser over the given input.
+     *
+     * @param reader source of the html characters to tokenise
+     * @param errors list that collects parse errors found while tokenising
+     */
+    public Tokeniser(CharacterReader reader, ParseErrorList errors) {
+        this.errors = errors;
+        this.reader = reader;
+    }
+
+    /**
+     * Produces the next token. The current state is run repeatedly until a
+     * non-character token has been emitted; any character data gathered in the
+     * meantime is returned first (as the shared character token), and the
+     * emitted token is then returned by the following call.
+     *
+     * @return the next token of the input
+     */
+    public Token read() {
+        while (!isEmitPending) {
+            state.read(this, reader);
+        }
+
+        final Token next;
+        if (charsBuilder.length() > 0) {
+            // several character emits were buffered: flush them as one token
+            final String buffered = charsBuilder.toString();
+            charsBuilder.setLength(0);
+            charsString = null;
+            next = charPending.data(buffered);
+        } else if (charsString != null) {
+            // exactly one character emit happened since the previous read
+            next = charPending.data(charsString);
+            charsString = null;
+        } else {
+            // no character data outstanding: hand over the pending token itself
+            isEmitPending = false;
+            next = emitPending;
+        }
+        return next;
+    }
+
+    void emit(Token token) {
+        Validate.isFalse(isEmitPending, "There is an unread token pending!");
+
+        emitPending = token;
+        isEmitPending = true;
+
+        if (token.type == Token.TokenType.StartTag) {
+            Token.StartTag startTag = (Token.StartTag) token;
+            lastStartTag = startTag.tagName;
+        } else if (token.type == Token.TokenType.EndTag) {
+            Token.EndTag endTag = (Token.EndTag) token;
+            if (endTag.attributes != null)
+                error("Attributes incorrectly present on end tag");
+        }
+    }
+
+    /**
+     * Buffers character data so that a run of emits between two reads comes
+     * out as a single character token. The first string is kept as-is; from
+     * the second one onwards everything is accumulated in the builder. This
+     * does not mark an emit as pending; {@link #read()} checks the buffers.
+     *
+     * @param str the characters to emit
+     */
+    void emit(final String str) {
+        if (charsString == null) {
+            charsString = str;
+            return;
+        }
+        if (charsBuilder.length() == 0) {
+            // second emit before a read: seed the builder with the first string
+            charsBuilder.append(charsString);
+        }
+        charsBuilder.append(str);
+    }
+
+    void emit(char[] chars) {
+        emit(String.valueOf(chars));
+    }
+
+    void emit(int[] codepoints) {
+        emit(new String(codepoints, 0, codepoints.length));
+    }
+
+    void emit(char c) {
+        emit(String.valueOf(c));
+    }
+
+    /**
+     * @return the current tokenisation state
+     */
+    TokeniserState getState() {
+        return state;
+    }
+
+    /**
+     * Switches to the given state without touching the reader position.
+     *
+     * @param state the state to use from now on
+     */
+    void transition(TokeniserState state) {
+        this.state = state;
+    }
+
+    /**
+     * Advances the reader and then switches to the given state.
+     *
+     * @param state the state to use from now on
+     */
+    void advanceTransition(TokeniserState state) {
+        reader.advance();
+        transition(state);
+    }
+
+    // Result holders reused across calls so that resolving a reference does not allocate.
+    final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
+    final private int[] multipointHolder = new int[2]; // used when a named entity maps to two code points
+
+    /**
+     * Tries to consume a character reference (numeric such as {@code #123;} /
+     * {@code #x7B;}, or named such as {@code amp;}) at the current reader
+     * position. Presumably the caller has already consumed the leading
+     * {@code '&'} - confirm against the tokeniser states.
+     * <p>
+     * When nothing usable is found the reader is rewound to where it was on
+     * entry (or was never moved) and {@code null} is returned. A numeric
+     * reference that is out of range yields the replacement character.
+     * <p>
+     * The returned array is one of two holders owned by this tokeniser and is
+     * overwritten by the next call, so callers must use it immediately.
+     *
+     * @param additionalAllowedCharacter if not {@code null} and equal to the
+     *                                   current character, nothing is consumed
+     * @param inAttribute                whether the reference occurs inside an
+     *                                   attribute value; a named reference
+     *                                   directly followed by a letter, digit,
+     *                                   {@code '='}, {@code '-'} or {@code '_'}
+     *                                   is then not treated as a reference
+     * @return the resolved code points (length 1 or 2), or {@code null} if no
+     * character reference was consumed
+     */
+    int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
+        if (reader.isEmpty()) {
+            return null;
+        }
+        if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
+            return null;
+        }
+        if (reader.matchesAnySorted(notCharRefCharsSorted)) {
+            return null;
+        }
+
+        final int[] codeRef = codepointHolder;
+        reader.mark();
+        if (reader.matchConsume("#")) { // numbered
+            boolean isHexMode = reader.matchConsumeIgnoreCase("X");
+            String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
+            if (numRef.length() == 0) { // didn't match anything
+                characterReferenceError("numeric reference with no numerals");
+                reader.rewindToMark();
+                return null;
+            }
+            if (!reader.matchConsume(";")) {
+                characterReferenceError("missing semicolon"); // missing semi
+            }
+            int charval = -1;
+            try {
+                int base = isHexMode ? 16 : 10;
+                // parseInt yields the primitive directly; no boxing round-trip needed
+                charval = Integer.parseInt(numRef, base);
+            } catch (NumberFormatException ignored) {
+                // value does not fit into an int: charval stays -1 and is reported as out of range below
+            }
+            if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
+                characterReferenceError("character outside of valid range");
+                codeRef[0] = replacementChar;
+                return codeRef;
+            } else {
+                // fix illegal unicode characters to match browser behavior
+                if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) {
+                    characterReferenceError("character is not a valid unicode code point");
+                    charval = win1252Extensions[charval - win1252ExtensionsStart];
+                }
+
+                // todo: implement number replacement table
+                // todo: check for extra illegal unicode points as parse errors
+                codeRef[0] = charval;
+                return codeRef;
+            }
+        } else { // named
+            // get as many letters as possible, and look for matching entities.
+            String nameRef = reader.consumeLetterThenDigitSequence();
+            boolean looksLegit = reader.matches(';');
+            // found if a base named entity without a ;, or an extended entity with the ;.
+            boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
+
+            if (!found) {
+                reader.rewindToMark();
+                if (looksLegit) { // named with semicolon
+                    characterReferenceError(String.format("invalid named reference '%s'", nameRef));
+                }
+                return null;
+            }
+            if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
+                // don't want that to match
+                reader.rewindToMark();
+                return null;
+            }
+            if (!reader.matchConsume(";")) {
+                characterReferenceError("missing semicolon"); // missing semi
+            }
+            int numChars = Entities.codepointsForName(nameRef, multipointHolder);
+            if (numChars == 1) {
+                codeRef[0] = multipointHolder[0];
+                return codeRef;
+            } else if (numChars == 2) {
+                return multipointHolder;
+            } else {
+                Validate.fail("Unexpected characters returned for " + nameRef);
+                return multipointHolder;
+            }
+        }
+    }
+
+    /**
+     * Starts building a new tag by resetting the reusable start or end tag
+     * token and making it the pending tag.
+     *
+     * @param start {@code true} for a start tag, {@code false} for an end tag
+     * @return the tag that is now pending
+     */
+    Token.Tag createTagPending(boolean start) {
+        if (start) {
+            tagPending = startPending.reset();
+        } else {
+            tagPending = endPending.reset();
+        }
+        return tagPending;
+    }
+
+    /**
+     * Finalises the pending tag and emits it.
+     */
+    void emitTagPending() {
+        tagPending.finaliseTag();
+        emit(tagPending);
+    }
+
+    /**
+     * Resets the reusable comment token so a new comment can be built up.
+     */
+    void createCommentPending() {
+        commentPending.reset();
+    }
+
+    /**
+     * Emits the comment token that has been built up.
+     */
+    void emitCommentPending() {
+        emit(commentPending);
+    }
+
+    /**
+     * Resets the reusable doctype token so a new doctype can be built up.
+     */
+    void createDoctypePending() {
+        doctypePending.reset();
+    }
+
+    /**
+     * Emits the doctype token that has been built up.
+     */
+    void emitDoctypePending() {
+        emit(doctypePending);
+    }
+
+    /**
+     * Resets {@code dataBuffer} (via {@code Token.reset}) so it can be used as a fresh temporary buffer.
+     */
+    void createTempBuffer() {
+        Token.reset(dataBuffer);
+    }
+
+    boolean isAppropriateEndTagToken() {
+        return lastStartTag != null && tagPending.name().equalsIgnoreCase(lastStartTag);
+    }
+
+    /**
+     * @return the name of the last emitted start tag, or {@code null} if no
+     * start tag has been emitted yet
+     */
+    String appropriateEndTagName() {
+        return lastStartTag; // could be null
+    }
+
+    /**
+     * Records an "unexpected character" parse error at the current reader
+     * position, provided the error list still accepts errors.
+     *
+     * @param state the state in which the unexpected character was met
+     */
+    void error(TokeniserState state) {
+        if (!errors.canAddError()) {
+            return;
+        }
+        errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
+    }
+
+    /**
+     * Records an "unexpected end of file" parse error at the current reader
+     * position, provided the error list still accepts errors.
+     *
+     * @param state the state in which the end of input was reached
+     */
+    void eofError(TokeniserState state) {
+        if (!errors.canAddError()) {
+            return;
+        }
+        errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
+    }
+
+    /**
+     * Records an "invalid character reference" parse error at the current
+     * reader position, provided the error list still accepts errors.
+     *
+     * @param message detail describing what is wrong with the reference
+     */
+    private void characterReferenceError(String message) {
+        if (!errors.canAddError()) {
+            return;
+        }
+        errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
+    }
+
+    /**
+     * Records a parse error with the given message at the current reader
+     * position, provided the error list still accepts errors.
+     *
+     * @param errorMsg the error message
+     */
+    void error(String errorMsg) {
+        if (!errors.canAddError()) {
+            return;
+        }
+        errors.add(new ParseError(reader.pos(), errorMsg));
+    }
+
+    /**
+     * Placeholder for a namespace check: namespaces are not implemented, so
+     * this always answers {@code true}.
+     *
+     * @return always {@code true}
+     */
+    boolean currentNodeInHtmlNS() {
+        // todo: implement namespaces correctly
+        return true;
+        // Element currentNode = currentNode();
+        // return currentNode != null && currentNode.namespace().equals("HTML");
+    }
+
+//    /**
+//     * Utility method to consume reader and unescape entities found within.
+//     * @param inAttribute if the text to be unescaped is in an attribute
+//     * @return unescaped string from reader
+//     */
+//    String unescapeEntities(boolean inAttribute) {
+//        StringBuilder builder = StringUtil.stringBuilder();
+//        while (!reader.isEmpty()) {
+//            builder.append(reader.consumeTo('&'));
+//            if (reader.matches('&')) {
+//                reader.consume();
+//                int[] c = consumeCharacterReference(null, inAttribute);
+//                if (c == null || c.length==0)
+//                    builder.append('&');
+//                else {
+//                    builder.appendCodePoint(c[0]);
+//                    if (c.length == 2)
+//                        builder.appendCodePoint(c[1]);
+//                }
+//
+//            }
+//        }
+//        return builder.toString();
+//    }
+}
--- a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/TokeniserState.java
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/TokeniserState.java
--- a/settings.gradle
+++ b/settings.gradle
@ -1 +1,2 @@
-include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension', ':library-syntax'
+include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension',
+        ':library-syntax', ':html-parser-api', ':html-parser-impl'
				`@ -0,0 +1 @@`
				`<manifest package="ru.noties.markwon.html" />`