Reuse HTML named entities with commonmark-java (less memory consumption)
This commit is contained in:
parent
d382b37a72
commit
dc5cc9471c
@ -2,8 +2,6 @@
|
||||
|
||||
# Markwon
|
||||
|
||||
& © † ‡
|
||||
|
||||
[](http://search.maven.org/#search|ga|1|g%3A%22ru.noties%22%20AND%20a%3A%22markwon%22)
|
||||
[](http://search.maven.org/#search|ga|1|g%3A%22ru.noties%22%20AND%20a%3A%22markwon-image-loader%22)
|
||||
[](http://search.maven.org/#search|ga|1|g%3A%22ru.noties%22%20AND%20a%3A%22markwon-syntax-highlight%22)
|
||||
|
@ -13,7 +13,6 @@ import java.util.concurrent.Future;
|
||||
import javax.inject.Inject;
|
||||
|
||||
import ru.noties.debug.Debug;
|
||||
import ru.noties.markwon.html.api.MarkwonHtmlParser;
|
||||
import ru.noties.markwon.spans.AsyncDrawable;
|
||||
import ru.noties.markwon.spans.SpannableTheme;
|
||||
import ru.noties.markwon.syntax.Prism4jSyntaxHighlight;
|
||||
@ -98,7 +97,6 @@ public class MarkdownRenderer {
|
||||
.build())
|
||||
.factory(new GifAwareSpannableFactory(gifPlaceholder))
|
||||
.trimWhiteSpaceEnd(false)
|
||||
.htmlParser(MarkwonHtmlParser.noOp())
|
||||
.build();
|
||||
|
||||
final long start = SystemClock.uptimeMillis();
|
||||
|
@ -19,6 +19,7 @@ dependencies {
|
||||
|
||||
deps.with {
|
||||
api it['support-annotations']
|
||||
api it['commonmark']
|
||||
}
|
||||
|
||||
deps.test.with {
|
||||
|
@ -0,0 +1,50 @@
|
||||
package ru.noties.markwon.html.impl.jsoup.nodes;
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
|
||||
import org.commonmark.internal.util.Html5Entities;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class CommonMarkEntities {
|
||||
|
||||
public static boolean isNamedEntity(@NonNull String name) {
|
||||
return COMMONMARK_NAMED_ENTITIES.containsKey(name);
|
||||
}
|
||||
|
||||
public static int codepointsForName(@NonNull String name, @NonNull int[] codepoints) {
|
||||
final String value = COMMONMARK_NAMED_ENTITIES.get(name);
|
||||
if (value != null) {
|
||||
final int length = value.length();
|
||||
if (length == 1) {
|
||||
codepoints[0] = value.charAt(0);
|
||||
} else {
|
||||
codepoints[0] = value.charAt(0);
|
||||
codepoints[1] = value.charAt(1);
|
||||
}
|
||||
return length;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private static final Map<String, String> COMMONMARK_NAMED_ENTITIES;
|
||||
|
||||
static {
|
||||
Map<String, String> map;
|
||||
try {
|
||||
final Field field = Html5Entities.class.getDeclaredField("NAMED_CHARACTER_REFERENCES");
|
||||
field.setAccessible(true);
|
||||
//noinspection unchecked
|
||||
map = (Map<String, String>) field.get(null);
|
||||
} catch (Throwable t) {
|
||||
map = Collections.emptyMap();
|
||||
t.printStackTrace();
|
||||
}
|
||||
COMMONMARK_NAMED_ENTITIES = map;
|
||||
}
|
||||
|
||||
private CommonMarkEntities() {
|
||||
}
|
||||
}
|
@ -1,351 +0,0 @@
|
||||
package ru.noties.markwon.html.impl.jsoup.nodes;
|
||||
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
||||
import ru.noties.markwon.html.impl.jsoup.helper.Validate;
|
||||
import ru.noties.markwon.html.impl.jsoup.parser.CharacterReader;
|
||||
|
||||
import static ru.noties.markwon.html.impl.jsoup.nodes.Entities.EscapeMode.base;
|
||||
import static ru.noties.markwon.html.impl.jsoup.nodes.Entities.EscapeMode.extended;
|
||||
|
||||
/**
|
||||
* HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
|
||||
* HTML named character references</a>.
|
||||
*/
|
||||
public class Entities {
|
||||
private static final int empty = -1;
|
||||
private static final String emptyName = "";
|
||||
static final int codepointRadix = 36;
|
||||
private static final char[] codeDelims = {',', ';'};
|
||||
private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
|
||||
// private static final Document.OutputSettings DefaultOutput = new Document.OutputSettings();
|
||||
|
||||
public enum EscapeMode {
|
||||
/**
|
||||
* Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
|
||||
*/
|
||||
xhtml(EntitiesData.xmlPoints, 4),
|
||||
/**
|
||||
* Default HTML output entities.
|
||||
*/
|
||||
base(EntitiesData.basePoints, 106),
|
||||
/**
|
||||
* Complete HTML entities.
|
||||
*/
|
||||
extended(EntitiesData.fullPoints, 2125);
|
||||
|
||||
// table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
|
||||
private String[] nameKeys;
|
||||
private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
|
||||
|
||||
// table of codepoints to named entities.
|
||||
private int[] codeKeys; // we don' support multicodepoints to single named value currently
|
||||
private String[] nameVals;
|
||||
|
||||
EscapeMode(String file, int size) {
|
||||
load(this, file, size);
|
||||
}
|
||||
|
||||
int codepointForName(final String name) {
|
||||
int index = Arrays.binarySearch(nameKeys, name);
|
||||
return index >= 0 ? codeVals[index] : empty;
|
||||
}
|
||||
|
||||
String nameForCodepoint(final int codepoint) {
|
||||
final int index = Arrays.binarySearch(codeKeys, codepoint);
|
||||
if (index >= 0) {
|
||||
// the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
|
||||
// (and binary search for same item with multi results is undefined
|
||||
return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
|
||||
nameVals[index + 1] : nameVals[index];
|
||||
}
|
||||
return emptyName;
|
||||
}
|
||||
|
||||
private int size() {
|
||||
return nameKeys.length;
|
||||
}
|
||||
}
|
||||
|
||||
private Entities() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the input is a known named entity
|
||||
*
|
||||
* @param name the possible entity name (e.g. "lt" or "amp")
|
||||
* @return true if a known named entity
|
||||
*/
|
||||
public static boolean isNamedEntity(final String name) {
|
||||
return extended.codepointForName(name) != empty;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the input is a known named entity in the base entity set.
|
||||
*
|
||||
* @param name the possible entity name (e.g. "lt" or "amp")
|
||||
* @return true if a known named entity in the base set
|
||||
* @see #isNamedEntity(String)
|
||||
*/
|
||||
public static boolean isBaseNamedEntity(final String name) {
|
||||
return base.codepointForName(name) != empty;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the Character value of the named entity
|
||||
*
|
||||
* @param name named entity (e.g. "lt" or "amp")
|
||||
* @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}')
|
||||
* @deprecated does not support characters outside the BMP or multiple character names
|
||||
*/
|
||||
public static Character getCharacterByName(String name) {
|
||||
return (char) extended.codepointForName(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the character(s) represented by the named entity
|
||||
*
|
||||
* @param name entity (e.g. "lt" or "amp")
|
||||
* @return the string value of the character(s) represented by this entity, or "" if not defined
|
||||
*/
|
||||
public static String getByName(String name) {
|
||||
String val = multipoints.get(name);
|
||||
if (val != null)
|
||||
return val;
|
||||
int codepoint = extended.codepointForName(name);
|
||||
if (codepoint != empty)
|
||||
return new String(new int[]{codepoint}, 0, 1);
|
||||
return emptyName;
|
||||
}
|
||||
|
||||
public static int codepointsForName(final String name, final int[] codepoints) {
|
||||
String val = multipoints.get(name);
|
||||
if (val != null) {
|
||||
codepoints[0] = val.codePointAt(0);
|
||||
codepoints[1] = val.codePointAt(1);
|
||||
return 2;
|
||||
}
|
||||
int codepoint = extended.codepointForName(name);
|
||||
if (codepoint != empty) {
|
||||
codepoints[0] = codepoint;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// /**
|
||||
// * HTML escape an input string. That is, {@code <} is returned as {@code <}
|
||||
// *
|
||||
// * @param string the un-escaped string to escape
|
||||
// * @param out the output settings to use
|
||||
// * @return the escaped string
|
||||
// */
|
||||
// public static String escape(String string, Document.OutputSettings out) {
|
||||
// if (string == null)
|
||||
// return "";
|
||||
// StringBuilder accum = new StringBuilder(string.length() * 2);
|
||||
// try {
|
||||
// escape(accum, string, out, false, false, false);
|
||||
// } catch (IOException e) {
|
||||
// throw new SerializationException(e); // doesn't happen
|
||||
// }
|
||||
// return accum.toString();
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
|
||||
// * {@code <}
|
||||
// *
|
||||
// * @param string the un-escaped string to escape
|
||||
// * @return the escaped string
|
||||
// */
|
||||
// public static String escape(String string) {
|
||||
// return escape(string, DefaultOutput);
|
||||
// }
|
||||
//
|
||||
// // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
|
||||
// static void escape(Appendable accum, String string, Document.OutputSettings out,
|
||||
// boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
|
||||
//
|
||||
// boolean lastWasWhite = false;
|
||||
// boolean reachedNonWhite = false;
|
||||
// final EscapeMode escapeMode = out.escapeMode();
|
||||
// final CharsetEncoder encoder = out.encoder();
|
||||
// final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
|
||||
// final int length = string.length();
|
||||
//
|
||||
// int codePoint;
|
||||
// for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
|
||||
// codePoint = string.codePointAt(offset);
|
||||
//
|
||||
// if (normaliseWhite) {
|
||||
// if (StringUtil.isWhitespace(codePoint)) {
|
||||
// if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
|
||||
// continue;
|
||||
// accum.append(' ');
|
||||
// lastWasWhite = true;
|
||||
// continue;
|
||||
// } else {
|
||||
// lastWasWhite = false;
|
||||
// reachedNonWhite = true;
|
||||
// }
|
||||
// }
|
||||
// // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
|
||||
// if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
|
||||
// final char c = (char) codePoint;
|
||||
// // html specific and required escapes:
|
||||
// switch (c) {
|
||||
// case '&':
|
||||
// accum.append("&");
|
||||
// break;
|
||||
// case 0xA0:
|
||||
// if (escapeMode != EscapeMode.xhtml)
|
||||
// accum.append(" ");
|
||||
// else
|
||||
// accum.append(" ");
|
||||
// break;
|
||||
// case '<':
|
||||
// // escape when in character data or when in a xml attribue val; not needed in html attr val
|
||||
// if (!inAttribute || escapeMode == EscapeMode.xhtml)
|
||||
// accum.append("<");
|
||||
// else
|
||||
// accum.append(c);
|
||||
// break;
|
||||
// case '>':
|
||||
// if (!inAttribute)
|
||||
// accum.append(">");
|
||||
// else
|
||||
// accum.append(c);
|
||||
// break;
|
||||
// case '"':
|
||||
// if (inAttribute)
|
||||
// accum.append(""");
|
||||
// else
|
||||
// accum.append(c);
|
||||
// break;
|
||||
// default:
|
||||
// if (canEncode(coreCharset, c, encoder))
|
||||
// accum.append(c);
|
||||
// else
|
||||
// appendEncoded(accum, escapeMode, codePoint);
|
||||
// }
|
||||
// } else {
|
||||
// final String c = new String(Character.toChars(codePoint));
|
||||
// if (encoder.canEncode(c)) // uses fallback encoder for simplicity
|
||||
// accum.append(c);
|
||||
// else
|
||||
// appendEncoded(accum, escapeMode, codePoint);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
|
||||
// final String name = escapeMode.nameForCodepoint(codePoint);
|
||||
// if (name != emptyName) // ok for identity check
|
||||
// accum.append('&').append(name).append(';');
|
||||
// else
|
||||
// accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * Un-escape an HTML escaped string. That is, {@code <} is returned as {@code <}.
|
||||
// *
|
||||
// * @param string the HTML string to un-escape
|
||||
// * @return the unescaped string
|
||||
// */
|
||||
// public static String unescape(String string) {
|
||||
// return unescape(string, false);
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * Unescape the input string.
|
||||
// *
|
||||
// * @param string to un-HTML-escape
|
||||
// * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
|
||||
// * @return unescaped string
|
||||
// */
|
||||
// static String unescape(String string, boolean strict) {
|
||||
// return Parser.unescapeEntities(string, strict);
|
||||
// }
|
||||
|
||||
/*
|
||||
* Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
|
||||
* After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
|
||||
* performance may be bad. We can add more encoders for common character sets that are impacted by performance
|
||||
* issues on Android if required.
|
||||
*
|
||||
* Benchmarks: *
|
||||
* OLD toHtml() impl v New (fastpath) in millis
|
||||
* Wiki: 1895, 16
|
||||
* CNN: 6378, 55
|
||||
* Alterslash: 3013, 28
|
||||
* Jsoup: 167, 2
|
||||
*/
|
||||
private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
|
||||
// todo add more charset tests if impacted by Android's bad perf in canEncode
|
||||
switch (charset) {
|
||||
case ascii:
|
||||
return c < 0x80;
|
||||
case utf:
|
||||
return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
|
||||
default:
|
||||
return fallback.canEncode(c);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Coarse classification of a charset, used to pick a fast encodability check.
 */
enum CoreCharset {
    ascii, utf, fallback;

    /**
     * Classifies a charset by its name.
     *
     * @param name charset name, matched case-sensitively
     * @return {@code ascii} for exactly "US-ASCII", {@code utf} for any name starting
     * with "UTF-" (covers UTF-8, UTF-16, et al), {@code fallback} otherwise
     */
    static CoreCharset byName(final String name) {
        if (!name.equals("US-ASCII")) {
            return name.startsWith("UTF-") ? utf : fallback;
        }
        return ascii;
    }
}
|
||||
|
||||
private static void load(EscapeMode e, String pointsData, int size) {
|
||||
e.nameKeys = new String[size];
|
||||
e.codeVals = new int[size];
|
||||
e.codeKeys = new int[size];
|
||||
e.nameVals = new String[size];
|
||||
|
||||
int i = 0;
|
||||
CharacterReader reader = new CharacterReader(pointsData);
|
||||
|
||||
while (!reader.isEmpty()) {
|
||||
// NotNestedLessLess=10913,824;1887&
|
||||
|
||||
final String name = reader.consumeTo('=');
|
||||
reader.advance();
|
||||
final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
|
||||
final char codeDelim = reader.current();
|
||||
reader.advance();
|
||||
final int cp2;
|
||||
if (codeDelim == ',') {
|
||||
cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
|
||||
reader.advance();
|
||||
} else {
|
||||
cp2 = empty;
|
||||
}
|
||||
final String indexS = reader.consumeTo('&');
|
||||
final int index = Integer.parseInt(indexS, codepointRadix);
|
||||
reader.advance();
|
||||
|
||||
e.nameKeys[i] = name;
|
||||
e.codeVals[i] = cp1;
|
||||
e.codeKeys[index] = cp1;
|
||||
e.nameVals[index] = name;
|
||||
|
||||
if (cp2 != empty) {
|
||||
multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
Validate.isTrue(i == size, "Unexpected count of entities loaded");
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
@ -3,7 +3,7 @@ package ru.noties.markwon.html.impl.jsoup.parser;
|
||||
import java.util.Arrays;
|
||||
|
||||
import ru.noties.markwon.html.impl.jsoup.helper.Validate;
|
||||
import ru.noties.markwon.html.impl.jsoup.nodes.Entities;
|
||||
import ru.noties.markwon.html.impl.jsoup.nodes.CommonMarkEntities;
|
||||
|
||||
/**
|
||||
* Reads the input stream into tokens.
|
||||
@ -175,7 +175,7 @@ public final class Tokeniser {
|
||||
String nameRef = reader.consumeLetterThenDigitSequence();
|
||||
boolean looksLegit = reader.matches(';');
|
||||
// found if a base named entity without a ;, or an extended entity with the ;.
|
||||
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
|
||||
boolean found = (CommonMarkEntities.isNamedEntity(nameRef) && looksLegit);
|
||||
|
||||
if (!found) {
|
||||
reader.rewindToMark();
|
||||
@ -190,7 +190,7 @@ public final class Tokeniser {
|
||||
}
|
||||
if (!reader.matchConsume(";"))
|
||||
characterReferenceError("missing semicolon"); // missing semi
|
||||
int numChars = Entities.codepointsForName(nameRef, multipointHolder);
|
||||
int numChars = CommonMarkEntities.codepointsForName(nameRef, multipointHolder);
|
||||
if (numChars == 1) {
|
||||
codeRef[0] = multipointHolder[0];
|
||||
return codeRef;
|
||||
|
@ -0,0 +1,22 @@
|
||||
package ru.noties.markwon.html.impl.jsoup.nodes;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.robolectric.RobolectricTestRunner;
|
||||
import org.robolectric.annotation.Config;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
@RunWith(RobolectricTestRunner.class)
|
||||
@Config(manifest = Config.NONE)
|
||||
public class CommonMarkEntitiesTest {
|
||||
|
||||
@Test
|
||||
public void can_access_field() {
|
||||
assertTrue("&", CommonMarkEntities.isNamedEntity("amp"));
|
||||
final int[] codepoints = new int[1];
|
||||
CommonMarkEntities.codepointsForName("amp", codepoints);
|
||||
assertEquals('&', codepoints[0]);
|
||||
}
|
||||
}
|
@ -244,7 +244,7 @@ public class SpannableConfiguration {
|
||||
* @param htmlIgnoreNonClosedTags that indicates if non-closed html tags should be kept open.
|
||||
* If this argument is false then all non-closed HTML tags
|
||||
* will be closed at the end of a document. Otherwise they will
|
||||
* be delivered non-closed {@link HtmlTag#isClosed()}
|
||||
* be delivered non-closed {@code HtmlTag#isClosed()}
|
||||
* @since 2.0.0
|
||||
*/
|
||||
@NonNull
|
||||
|
@ -2,8 +2,6 @@ package ru.noties.markwon.renderer;
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
import android.support.annotation.Nullable;
|
||||
import android.text.Spanned;
|
||||
import android.util.Log;
|
||||
|
||||
import org.commonmark.ext.gfm.strikethrough.Strikethrough;
|
||||
import org.commonmark.ext.gfm.tables.TableBody;
|
||||
@ -455,7 +453,6 @@ public class SpannableMarkdownVisitor extends AbstractVisitor {
|
||||
|
||||
private void visitHtml(@Nullable String html) {
|
||||
if (html != null) {
|
||||
Log.e("HTML", html);
|
||||
htmlParser.processFragment(builder, html);
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user