Reuse HTML named entities with commonmark-java (less memory consumption)

2018-08-20 17:11:36 +03:00 · 2018-08-20 17:11:36 +03:00 · dc5cc9471c
commit dc5cc9471c
parent d382b37a72
10 changed files with 77 additions and 373 deletions
--- a/README.md
+++ b/README.md
@@ -2,8 +2,6 @@

 # Markwon

-&amp; &copy; &dagger; &Dagger;
-
 [![markwon](https://img.shields.io/maven-central/v/ru.noties/markwon.svg?label=markwon)](http://search.maven.org/#search|ga|1|g%3A%22ru.noties%22%20AND%20a%3A%22markwon%22)
 [![markwon-image-loader](https://img.shields.io/maven-central/v/ru.noties/markwon-image-loader.svg?label=markwon-image-loader)](http://search.maven.org/#search|ga|1|g%3A%22ru.noties%22%20AND%20a%3A%22markwon-image-loader%22)
 [![markwon-syntax-highlight](https://img.shields.io/maven-central/v/ru.noties/markwon-syntax-highlight.svg?label=markwon-syntax-highlight)](http://search.maven.org/#search|ga|1|g%3A%22ru.noties%22%20AND%20a%3A%22markwon-syntax-highlight%22)
--- a/app/src/main/java/ru/noties/markwon/MarkdownRenderer.java
+++ b/app/src/main/java/ru/noties/markwon/MarkdownRenderer.java
@@ -13,7 +13,6 @@ import java.util.concurrent.Future;
 import javax.inject.Inject;

 import ru.noties.debug.Debug;
-import ru.noties.markwon.html.api.MarkwonHtmlParser;
 import ru.noties.markwon.spans.AsyncDrawable;
 import ru.noties.markwon.spans.SpannableTheme;
 import ru.noties.markwon.syntax.Prism4jSyntaxHighlight;
@@ -98,7 +97,6 @@ public class MarkdownRenderer {
                                .build())
                        .factory(new GifAwareSpannableFactory(gifPlaceholder))
                        .trimWhiteSpaceEnd(false)
-                        .htmlParser(MarkwonHtmlParser.noOp())
                        .build();

                final long start = SystemClock.uptimeMillis();
--- a/markwon-html-parser-impl/build.gradle
+++ b/markwon-html-parser-impl/build.gradle
@@ -19,6 +19,7 @@ dependencies {

    deps.with {
        api it['support-annotations']
+        api it['commonmark']
    }

    deps.test.with {
--- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/CommonMarkEntities.java
+++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/CommonMarkEntities.java
@@ -0,0 +1,50 @@
+package ru.noties.markwon.html.impl.jsoup.nodes;
+
+import android.support.annotation.NonNull;
+
+import org.commonmark.internal.util.Html5Entities;
+
+import java.lang.reflect.Field;
+import java.util.Collections;
+import java.util.Map;
+
+public abstract class CommonMarkEntities {
+
+    public static boolean isNamedEntity(@NonNull String name) {
+        return COMMONMARK_NAMED_ENTITIES.containsKey(name);
+    }
+
+    public static int codepointsForName(@NonNull String name, @NonNull int[] codepoints) {
+        final String value = COMMONMARK_NAMED_ENTITIES.get(name);
+        if (value != null) {
+            final int length = value.length();
+            if (length == 1) {
+                codepoints[0] = value.charAt(0);
+            } else {
+                codepoints[0] = value.charAt(0);
+                codepoints[1] = value.charAt(1);
+            }
+            return length;
+        }
+        return 0;
+    }
+
+    private static final Map<String, String> COMMONMARK_NAMED_ENTITIES;
+
+    static {
+        Map<String, String> map;
+        try {
+            final Field field = Html5Entities.class.getDeclaredField("NAMED_CHARACTER_REFERENCES");
+            field.setAccessible(true);
+            //noinspection unchecked
+            map = (Map<String, String>) field.get(null);
+        } catch (Throwable t) {
+            map = Collections.emptyMap();
+            t.printStackTrace();
+        }
+        COMMONMARK_NAMED_ENTITIES = map;
+    }
+
+    private CommonMarkEntities() {
+    }
+}
--- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/Entities.java
+++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/Entities.java
@@ -1,351 +0,0 @@
-package ru.noties.markwon.html.impl.jsoup.nodes;
-
-import java.nio.charset.CharsetEncoder;
-import java.util.Arrays;
-import java.util.HashMap;
-
-import ru.noties.markwon.html.impl.jsoup.helper.Validate;
-import ru.noties.markwon.html.impl.jsoup.parser.CharacterReader;
-
-import static ru.noties.markwon.html.impl.jsoup.nodes.Entities.EscapeMode.base;
-import static ru.noties.markwon.html.impl.jsoup.nodes.Entities.EscapeMode.extended;
-
-/**
- * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
- * HTML named character references</a>.
- */
-public class Entities {
-    private static final int empty = -1;
-    private static final String emptyName = "";
-    static final int codepointRadix = 36;
-    private static final char[] codeDelims = {',', ';'};
-    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
-//    private static final Document.OutputSettings DefaultOutput = new Document.OutputSettings();
-
-    public enum EscapeMode {
-        /**
-         * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
-         */
-        xhtml(EntitiesData.xmlPoints, 4),
-        /**
-         * Default HTML output entities.
-         */
-        base(EntitiesData.basePoints, 106),
-        /**
-         * Complete HTML entities.
-         */
-        extended(EntitiesData.fullPoints, 2125);
-
-        // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
-        private String[] nameKeys;
-        private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
-
-        // table of codepoints to named entities.
-        private int[] codeKeys; // we don' support multicodepoints to single named value currently
-        private String[] nameVals;
-
-        EscapeMode(String file, int size) {
-            load(this, file, size);
-        }
-
-        int codepointForName(final String name) {
-            int index = Arrays.binarySearch(nameKeys, name);
-            return index >= 0 ? codeVals[index] : empty;
-        }
-
-        String nameForCodepoint(final int codepoint) {
-            final int index = Arrays.binarySearch(codeKeys, codepoint);
-            if (index >= 0) {
-                // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
-                // (and binary search for same item with multi results is undefined
-                return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
-                        nameVals[index + 1] : nameVals[index];
-            }
-            return emptyName;
-        }
-
-        private int size() {
-            return nameKeys.length;
-        }
-    }
-
-    private Entities() {
-    }
-
-    /**
-     * Check if the input is a known named entity
-     *
-     * @param name the possible entity name (e.g. "lt" or "amp")
-     * @return true if a known named entity
-     */
-    public static boolean isNamedEntity(final String name) {
-        return extended.codepointForName(name) != empty;
-    }
-
-    /**
-     * Check if the input is a known named entity in the base entity set.
-     *
-     * @param name the possible entity name (e.g. "lt" or "amp")
-     * @return true if a known named entity in the base set
-     * @see #isNamedEntity(String)
-     */
-    public static boolean isBaseNamedEntity(final String name) {
-        return base.codepointForName(name) != empty;
-    }
-
-    /**
-     * Get the Character value of the named entity
-     *
-     * @param name named entity (e.g. "lt" or "amp")
-     * @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}')
-     * @deprecated does not support characters outside the BMP or multiple character names
-     */
-    public static Character getCharacterByName(String name) {
-        return (char) extended.codepointForName(name);
-    }
-
-    /**
-     * Get the character(s) represented by the named entity
-     *
-     * @param name entity (e.g. "lt" or "amp")
-     * @return the string value of the character(s) represented by this entity, or "" if not defined
-     */
-    public static String getByName(String name) {
-        String val = multipoints.get(name);
-        if (val != null)
-            return val;
-        int codepoint = extended.codepointForName(name);
-        if (codepoint != empty)
-            return new String(new int[]{codepoint}, 0, 1);
-        return emptyName;
-    }
-
-    public static int codepointsForName(final String name, final int[] codepoints) {
-        String val = multipoints.get(name);
-        if (val != null) {
-            codepoints[0] = val.codePointAt(0);
-            codepoints[1] = val.codePointAt(1);
-            return 2;
-        }
-        int codepoint = extended.codepointForName(name);
-        if (codepoint != empty) {
-            codepoints[0] = codepoint;
-            return 1;
-        }
-        return 0;
-    }
-
-//    /**
-//     * HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
-//     *
-//     * @param string the un-escaped string to escape
-//     * @param out the output settings to use
-//     * @return the escaped string
-//     */
-//    public static String escape(String string, Document.OutputSettings out) {
-//        if (string == null)
-//            return "";
-//        StringBuilder accum = new StringBuilder(string.length() * 2);
-//        try {
-//            escape(accum, string, out, false, false, false);
-//        } catch (IOException e) {
-//            throw new SerializationException(e); // doesn't happen
-//        }
-//        return accum.toString();
-//    }
-
-//    /**
-//     * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
-//     * {@code &lt;}
-//     *
-//     * @param string the un-escaped string to escape
-//     * @return the escaped string
-//     */
-//    public static String escape(String string) {
-//        return escape(string, DefaultOutput);
-//    }
-//
-//    // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
-//    static void escape(Appendable accum, String string, Document.OutputSettings out,
-//                       boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
-//
-//        boolean lastWasWhite = false;
-//        boolean reachedNonWhite = false;
-//        final EscapeMode escapeMode = out.escapeMode();
-//        final CharsetEncoder encoder = out.encoder();
-//        final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
-//        final int length = string.length();
-//
-//        int codePoint;
-//        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
-//            codePoint = string.codePointAt(offset);
-//
-//            if (normaliseWhite) {
-//                if (StringUtil.isWhitespace(codePoint)) {
-//                    if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
-//                        continue;
-//                    accum.append(' ');
-//                    lastWasWhite = true;
-//                    continue;
-//                } else {
-//                    lastWasWhite = false;
-//                    reachedNonWhite = true;
-//                }
-//            }
-//            // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
-//            if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
-//                final char c = (char) codePoint;
-//                // html specific and required escapes:
-//                switch (c) {
-//                    case '&':
-//                        accum.append("&amp;");
-//                        break;
-//                    case 0xA0:
-//                        if (escapeMode != EscapeMode.xhtml)
-//                            accum.append("&nbsp;");
-//                        else
-//                            accum.append("&#xa0;");
-//                        break;
-//                    case '<':
-//                        // escape when in character data or when in a xml attribue val; not needed in html attr val
-//                        if (!inAttribute || escapeMode == EscapeMode.xhtml)
-//                            accum.append("&lt;");
-//                        else
-//                            accum.append(c);
-//                        break;
-//                    case '>':
-//                        if (!inAttribute)
-//                            accum.append("&gt;");
-//                        else
-//                            accum.append(c);
-//                        break;
-//                    case '"':
-//                        if (inAttribute)
-//                            accum.append("&quot;");
-//                        else
-//                            accum.append(c);
-//                        break;
-//                    default:
-//                        if (canEncode(coreCharset, c, encoder))
-//                            accum.append(c);
-//                        else
-//                            appendEncoded(accum, escapeMode, codePoint);
-//                }
-//            } else {
-//                final String c = new String(Character.toChars(codePoint));
-//                if (encoder.canEncode(c)) // uses fallback encoder for simplicity
-//                    accum.append(c);
-//                else
-//                    appendEncoded(accum, escapeMode, codePoint);
-//            }
-//        }
-//    }
-
-//    private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
-//        final String name = escapeMode.nameForCodepoint(codePoint);
-//        if (name != emptyName) // ok for identity check
-//            accum.append('&').append(name).append(';');
-//        else
-//            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
-//    }
-
-//    /**
-//     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
-//     *
-//     * @param string the HTML string to un-escape
-//     * @return the unescaped string
-//     */
-//    public static String unescape(String string) {
-//        return unescape(string, false);
-//    }
-
-//    /**
-//     * Unescape the input string.
-//     *
-//     * @param string to un-HTML-escape
-//     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
-//     * @return unescaped string
-//     */
-//    static String unescape(String string, boolean strict) {
-//        return Parser.unescapeEntities(string, strict);
-//    }
-
-    /*
-     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
-     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
-     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
-     * issues on Android if required.
-     *
-     * Benchmarks:     *
-     * OLD toHtml() impl v New (fastpath) in millis
-     * Wiki: 1895, 16
-     * CNN: 6378, 55
-     * Alterslash: 3013, 28
-     * Jsoup: 167, 2
-     */
-    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
-        // todo add more charset tests if impacted by Android's bad perf in canEncode
-        switch (charset) {
-            case ascii:
-                return c < 0x80;
-            case utf:
-                return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
-            default:
-                return fallback.canEncode(c);
-        }
-    }
-
-    enum CoreCharset {
-        ascii, utf, fallback;
-
-        static CoreCharset byName(final String name) {
-            if (name.equals("US-ASCII"))
-                return ascii;
-            if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
-                return utf;
-            return fallback;
-        }
-    }
-
-    private static void load(EscapeMode e, String pointsData, int size) {
-        e.nameKeys = new String[size];
-        e.codeVals = new int[size];
-        e.codeKeys = new int[size];
-        e.nameVals = new String[size];
-
-        int i = 0;
-        CharacterReader reader = new CharacterReader(pointsData);
-
-        while (!reader.isEmpty()) {
-            // NotNestedLessLess=10913,824;1887&
-
-            final String name = reader.consumeTo('=');
-            reader.advance();
-            final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
-            final char codeDelim = reader.current();
-            reader.advance();
-            final int cp2;
-            if (codeDelim == ',') {
-                cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
-                reader.advance();
-            } else {
-                cp2 = empty;
-            }
-            final String indexS = reader.consumeTo('&');
-            final int index = Integer.parseInt(indexS, codepointRadix);
-            reader.advance();
-
-            e.nameKeys[i] = name;
-            e.codeVals[i] = cp1;
-            e.codeKeys[index] = cp1;
-            e.nameVals[index] = name;
-
-            if (cp2 != empty) {
-                multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
-            }
-            i++;
-        }
-
-        Validate.isTrue(i == size, "Unexpected count of entities loaded");
-    }
-}
--- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/EntitiesData.java
+++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/EntitiesData.java
--- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/parser/Tokeniser.java
+++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/parser/Tokeniser.java
@@ -3,7 +3,7 @@ package ru.noties.markwon.html.impl.jsoup.parser;
 import java.util.Arrays;

 import ru.noties.markwon.html.impl.jsoup.helper.Validate;
-import ru.noties.markwon.html.impl.jsoup.nodes.Entities;
+import ru.noties.markwon.html.impl.jsoup.nodes.CommonMarkEntities;

 /**
 * Readers the input stream into tokens.
@@ -175,7 +175,7 @@ public final class Tokeniser {
            String nameRef = reader.consumeLetterThenDigitSequence();
            boolean looksLegit = reader.matches(';');
            // found if a base named entity without a ;, or an extended entity with the ;.
-            boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
+            boolean found = (CommonMarkEntities.isNamedEntity(nameRef) && looksLegit);

            if (!found) {
                reader.rewindToMark();
@@ -190,7 +190,7 @@ public final class Tokeniser {
            }
            if (!reader.matchConsume(";"))
                characterReferenceError("missing semicolon"); // missing semi
-            int numChars = Entities.codepointsForName(nameRef, multipointHolder);
+            int numChars = CommonMarkEntities.codepointsForName(nameRef, multipointHolder);
            if (numChars == 1) {
                codeRef[0] = multipointHolder[0];
                return codeRef;
--- a/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/jsoup/nodes/CommonMarkEntitiesTest.java
+++ b/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/jsoup/nodes/CommonMarkEntitiesTest.java
@@ -0,0 +1,22 @@
+package ru.noties.markwon.html.impl.jsoup.nodes;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.robolectric.RobolectricTestRunner;
+import org.robolectric.annotation.Config;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+@RunWith(RobolectricTestRunner.class)
+@Config(manifest = Config.NONE)
+public class CommonMarkEntitiesTest {
+
+    @Test
+    public void can_access_field() {
+        assertTrue("&", CommonMarkEntities.isNamedEntity("amp"));
+        final int[] codepoints = new int[1];
+        CommonMarkEntities.codepointsForName("amp", codepoints);
+        assertEquals('&', codepoints[0]);
+    }
+}
--- a/markwon/src/main/java/ru/noties/markwon/SpannableConfiguration.java
+++ b/markwon/src/main/java/ru/noties/markwon/SpannableConfiguration.java
@@ -244,7 +244,7 @@ public class SpannableConfiguration {
         * @param htmlIgnoreNonClosedTags that indicates if non-closed html tags should be kept open.
         *                                If this argument is false then all non-closed HTML tags
         *                                will be closed at the end of a document. Otherwise they will
-         *                                be delivered non-closed {@link HtmlTag#isClosed()}
+         *                                be delivered non-closed {@code HtmlTag#isClosed()}
         * @since 2.0.0
         */
        @NonNull
--- a/markwon/src/main/java/ru/noties/markwon/renderer/SpannableMarkdownVisitor.java
+++ b/markwon/src/main/java/ru/noties/markwon/renderer/SpannableMarkdownVisitor.java
@@ -2,8 +2,6 @@ package ru.noties.markwon.renderer;

 import android.support.annotation.NonNull;
 import android.support.annotation.Nullable;
-import android.text.Spanned;
-import android.util.Log;

 import org.commonmark.ext.gfm.strikethrough.Strikethrough;
 import org.commonmark.ext.gfm.tables.TableBody;
@@ -455,7 +453,6 @@ public class SpannableMarkdownVisitor extends AbstractVisitor {

    private void visitHtml(@Nullable String html) {
        if (html != null) {
-            Log.e("HTML", html);
            htmlParser.processFragment(builder, html);
        }
    }