, Cloneable {
+// private static final String[] booleanAttributes = {
+// "allowfullscreen", "async", "autofocus", "checked", "compact", "declare", "default", "defer", "disabled",
+// "formnovalidate", "hidden", "inert", "ismap", "itemscope", "multiple", "muted", "nohref", "noresize",
+// "noshade", "novalidate", "nowrap", "open", "readonly", "required", "reversed", "seamless", "selected",
+// "sortable", "truespeed", "typemustmatch"
+// };
+
+ private String key; // attribute name; trimmed by the constructor and setKey before being stored
+ private String val; // attribute value; may be null
+ Attributes parent; // used to update the holding Attributes when the key / value is changed via this interface
+
+ /**
+ * Create a new, parentless attribute from an unencoded (raw) key and value.
+ * @param key attribute key; case is preserved. Checked by the three-argument constructor this delegates to.
+ * @param value attribute value, stored as given
+ */
+ public Attribute(String key, String value) {
+ this(key, value, null); // null parent: not attached to any Attributes yet
+ }
+
+ /**
+ * Create a new attribute from unencoded (raw) key and value.
+ * @param key attribute key; case is preserved. Must not be null, and must not be empty once trimmed.
+ * @param val attribute value
+ * @param parent the containing Attributes (this Attribute is not automatically added to said Attributes)
+ */
+ public Attribute(String key, String val, Attributes parent) {
+     Validate.notNull(key);
+     this.key = key.trim();
+     Validate.notEmpty(this.key); // check the trimmed key: a whitespace-only argument trims down to empty
+     this.val = val;
+     this.parent = parent;
+ }
+
+ /**
+ Get the attribute key.
+ @return the attribute key, as stored (trimmed; case preserved)
+ */
+ public String getKey() {
+ return key;
+ }
+
+ /**
+ Rename this attribute; the new key's case is preserved. The rename is mirrored into the parent, if there is one.
+ @param key the new key; must not be null, and must not be empty after trimming
+ */
+ public void setKey(String key) {
+     Validate.notNull(key);
+     final String trimmed = key.trim();
+     Validate.notEmpty(trimmed); // trimming could potentially make empty, so validate here
+     final Attributes holder = parent;
+     final int slot = holder == null ? Attributes.NotFound : holder.indexOfKey(this.key);
+     if (slot != Attributes.NotFound) {
+         holder.keys[slot] = trimmed; // keep the holding Attributes in step with this object
+     }
+     this.key = trimmed;
+ }
+
+ /**
+ Get the attribute value.
+ @return the attribute value exactly as stored; may be null (no empty-string substitution happens here)
+ */
+ public String getValue() {
+ return val;
+ }
+
+ /**
+ Set the attribute value, writing it through to the parent Attributes when this attribute has one.
+ @param val the new attribute value
+ @return the previous value: the parent's view of it (never null) when attached, otherwise the stored value (may be null)
+ */
+ public String setValue(String val) {
+     String oldVal = parent != null ? parent.get(this.key) : this.val; // a detached attribute has no parent to ask
+     if (parent != null) {
+         int i = parent.indexOfKey(this.key);
+         if (i != Attributes.NotFound) parent.vals[i] = val;
+     }
+     this.val = val;
+     return oldVal;
+ }
+
+// /**
+// Get the HTML representation of this attribute; e.g. {@code href="index.html"}.
+// @return HTML
+// */
+// public String html() {
+// StringBuilder accum = new StringBuilder();
+//
+// try {
+// html(accum, (new Document("")).outputSettings());
+// } catch(IOException exception) {
+// throw new SerializationException(exception);
+// }
+// return accum.toString();
+// }
+//
+// protected static void html(String key, String val, Appendable accum, Document.OutputSettings out) throws IOException {
+// accum.append(key);
+// if (!shouldCollapseAttribute(key, val, out)) {
+// accum.append("=\"");
+// Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false);
+// accum.append('"');
+// }
+// }
+//
+// protected void html(Appendable accum, Document.OutputSettings out) throws IOException {
+// html(key, val, accum, out);
+// }
+
+// /**
+// Get the string representation of this attribute, implemented as {@link #html()}.
+// @return string
+// */
+// @Override
+// public String toString() {
+// return html();
+// }
+
+// /**
+// * Create a new Attribute from an unencoded key and a HTML attribute encoded value.
+// * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars.
+// * @param encodedValue HTML attribute encoded value
+// * @return attribute
+// */
+// public static Attribute createFromEncoded(String unencodedKey, String encodedValue) {
+// String value = Entities.unescape(encodedValue, true);
+// return new Attribute(unencodedKey, value, null); // parent will get set when Put
+// }
+
+ protected boolean isDataAttribute() {
+ return isDataAttribute(key); // delegates to the static check, using this attribute's own key
+ }
+
+ protected static boolean isDataAttribute(String key) {
+     return key.length() > Attributes.dataPrefix.length() && key.startsWith(Attributes.dataPrefix); // prefix plus at least one more char
+ }
+
+// /**
+// * Collapsible if it's a boolean attribute and value is empty or same as name
+// *
+// * @param out output settings
+// * @return Returns whether collapsible or not
+// */
+// protected final boolean shouldCollapseAttribute(Document.OutputSettings out) {
+// return shouldCollapseAttribute(key, val, out);
+// }
+
+// protected static boolean shouldCollapseAttribute(final String key, final String val, final Document.OutputSettings out) {
+// return (
+// out.syntax() == Document.OutputSettings.Syntax.html &&
+// (val == null || ("".equals(val) || val.equalsIgnoreCase(key)) && Attribute.isBooleanAttribute(key)));
+// }
+
+// /**
+// * @deprecated
+// */
+// protected boolean isBooleanAttribute() {
+// return Arrays.binarySearch(booleanAttributes, key) >= 0 || val == null;
+// }
+//
+// /**
+// * Checks if this attribute name is defined as a boolean attribute in HTML5
+// */
+// protected static boolean isBooleanAttribute(final String key) {
+// return Arrays.binarySearch(booleanAttributes, key) >= 0;
+// }
+
+ @Override
+ public boolean equals(Object o) { // compares key and value only; the parent link is deliberately ignored
+     if (o == this) return true;
+     if (o == null || o.getClass() != getClass()) return false;
+     final Attribute other = (Attribute) o;
+     final boolean sameKey = key == null ? other.key == null : key.equals(other.key);
+     return sameKey && (val == null ? other.val == null : val.equals(other.val));
+ }
+
+ @Override
+ public int hashCode() { // derived from key and value only; parent not considered
+     final int keyHash = key == null ? 0 : key.hashCode();
+     final int valHash = val == null ? 0 : val.hashCode();
+     return 31 * keyHash + valHash;
+ }
+
+ @Override
+ public Attribute clone() { // returns the copy produced by super.clone(), narrowed to Attribute
+ try {
+ return (Attribute) super.clone();
+ } catch (CloneNotSupportedException e) { // not expected: the class declares Cloneable
+ throw new RuntimeException(e);
+ }
+ }
+}
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attributes.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attributes.java
new file mode 100644
index 00000000..f00ecfe1
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attributes.java
@@ -0,0 +1,444 @@
+package ru.noties.markwon.html.jsoup.nodes;
+
+import java.util.AbstractMap;
+import java.util.AbstractSet;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import ru.noties.markwon.html.jsoup.helper.Validate;
+
+import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase;
+
+/**
+ * The attributes of an Element.
+ * <p>
+ * Attributes are treated as a map: there can be only one value associated with an attribute key/name.
+ * </p>
+ * <p>
+ * Attribute name and value comparisons are generally case sensitive. By default for HTML, attribute names are
+ * normalized to lower-case on parsing. That means you should use lower-case strings when referring to attributes by
+ * name.
+ * </p>
+ *
+ * @author Jonathan Hedley, jonathan@hedley.net
+ */
+public class Attributes implements Iterable<Attribute>, Cloneable {
+ protected static final String dataPrefix = "data-";
+ private static final int InitialCapacity = 4; // todo - analyze Alexa 1MM sites, determine best setting
+
+ // manages the key/val arrays
+ private static final int GrowthFactor = 2;
+ private static final String[] Empty = {};
+ static final int NotFound = -1; // sentinel index returned by the key lookups when nothing matches
+ private static final String EmptyString = "";
+
+ private int size = 0; // number of slots used (not capacity, which is keys.length)
+ String[] keys = Empty; // keys[i] pairs with vals[i]; only the first size slots hold data
+ String[] vals = Empty; // a null entry marks a boolean attribute (key without value)
+
+ // check there's room for more
+ private void checkCapacity(int minNewSize) {
+ Validate.isTrue(minNewSize >= size);
+ int curSize = keys.length;
+ if (curSize >= minNewSize)
+ return;
+
+ int newSize = curSize >= InitialCapacity ? size * GrowthFactor : InitialCapacity;
+ if (minNewSize > newSize)
+ newSize = minNewSize;
+
+ keys = copyOf(keys, newSize);
+ vals = copyOf(vals, newSize);
+ }
+
+ // Minimal stand-in for Arrays.copyOf (for Android API 8): truncates or null-pads to the requested size.
+ private static String[] copyOf(String[] orig, int size) {
+     final String[] result = new String[size];
+     final int carried = Math.min(orig.length, size);
+     System.arraycopy(orig, 0, result, 0, carried);
+     return result;
+ }
+
+ int indexOfKey(String key) { // linear, case-sensitive scan of the used slots; NotFound when absent
+     Validate.notNull(key);
+     int idx = 0;
+     while (idx < size && !key.equals(keys[idx])) {
+         idx++;
+     }
+     return idx < size ? idx : NotFound;
+ }
+
+ private int indexOfKeyIgnoreCase(String key) { // as indexOfKey, but the first match regardless of case wins
+     Validate.notNull(key);
+     int idx = 0;
+     while (idx < size && !key.equalsIgnoreCase(keys[idx])) {
+         idx++;
+     }
+     return idx < size ? idx : NotFound;
+ }
+
+ // Boolean attributes are stored with a null value (they are just keys); consumers get the empty string instead.
+ static String checkNotNull(String val) {
+     return val != null ? val : EmptyString;
+ }
+
+ /**
+ Look up an attribute value by key.
+ @param key the (case-sensitive) attribute key
+ @return the attribute value if set; or empty string if not set (or a boolean attribute).
+ @see #hasKey(String)
+ */
+ public String get(String key) {
+     final int idx = indexOfKey(key);
+     return idx != NotFound ? checkNotNull(vals[idx]) : EmptyString;
+ }
+
+ /**
+ * Look up an attribute's value by key, ignoring case.
+ * @param key the attribute name
+ * @return the first matching attribute value if set; or empty string if not set (or a boolean attribute).
+ */
+ public String getIgnoreCase(String key) {
+     final int idx = indexOfKeyIgnoreCase(key);
+     return idx != NotFound ? checkNotNull(vals[idx]) : EmptyString;
+ }
+
+ // adds without checking if this key exists
+ private void add(String key, String value) {
+ checkCapacity(size + 1);
+ keys[size] = key;
+ vals[size] = value;
+ size++;
+ }
+
+ /**
+ * Store a value under a key: overwrites the value of an existing key, otherwise appends a new pair.
+ * @param key case sensitive attribute key
+ * @param value attribute value
+ * @return these attributes, for chaining
+ */
+ public Attributes put(String key, String value) {
+     final int idx = indexOfKey(key);
+     if (idx == NotFound)
+         add(key, value); // new key: append
+     else
+         vals[idx] = value; // known key: overwrite in place
+     return this;
+ }
+
+ void putIgnoreCase(String key, String value) {
+ int i = indexOfKeyIgnoreCase(key);
+ if (i != NotFound) {
+ vals[i] = value;
+ if (!keys[i].equals(key)) // case changed, update
+ keys[i] = key;
+ }
+ else
+ add(key, value);
+ }
+
+ /**
+ * Set or clear a boolean attribute. True stores the key with a null value; false removes the attribute.
+ * @param key attribute key; matched ignoring case when setting, but case sensitively when removing
+ * @param value true to set the attribute, false to remove it
+ * @return these attributes, for chaining
+ */
+ public Attributes put(String key, boolean value) {
+     if (!value)
+         remove(key);
+     else
+         putIgnoreCase(key, null);
+     return this;
+ }
+
+ /**
+ Store the attribute's key and value here (replacing any existing value for that key) and adopt it as a child.
+ @param attribute attribute with case sensitive key; must not be null
+ @return these attributes, for chaining
+ */
+ public Attributes put(Attribute attribute) {
+     Validate.notNull(attribute);
+     final String name = attribute.getKey();
+     put(name, attribute.getValue());
+     attribute.parent = this; // later setKey / setValue calls on it now write through to this container
+     return this;
+ }
+
+ // removes and shifts up
+ private void remove(int index) {
+ Validate.isFalse(index >= size);
+ int shifted = size - index - 1;
+ if (shifted > 0) {
+ System.arraycopy(keys, index + 1, keys, index, shifted);
+ System.arraycopy(vals, index + 1, vals, index, shifted);
+ }
+ size--;
+ keys[size] = null; // release hold
+ vals[size] = null;
+ }
+
+ /**
+ Remove an attribute by key. Case sensitive. Does nothing when the key is not present.
+ @param key attribute key to remove
+ */
+ public void remove(String key) {
+     final int idx = indexOfKey(key);
+     if (idx == NotFound) return; // nothing to remove
+     remove(idx);
+ }
+
+ /**
+ Remove the first attribute whose key matches, ignoring case. Does nothing when there is no match.
+ @param key attribute key to remove
+ */
+ public void removeIgnoreCase(String key) {
+     final int idx = indexOfKeyIgnoreCase(key);
+     final boolean present = idx != NotFound;
+     if (present) remove(idx);
+ }
+
+ /**
+ Tests if these attributes contain an attribute with this key.
+ @param key case-sensitive key to check for; must not be null (the lookup validates it)
+ @return true if key exists (whatever its value, including none), false otherwise
+ */
+ public boolean hasKey(String key) {
+ return indexOfKey(key) != NotFound;
+ }
+
+ /**
+ Tests if these attributes contain an attribute with this key, comparing keys without regard to case.
+ @param key key to check for; must not be null (the lookup validates it)
+ @return true if key exists, false otherwise
+ */
+ public boolean hasKeyIgnoreCase(String key) {
+ return indexOfKeyIgnoreCase(key) != NotFound;
+ }
+
+ /**
+ Get the number of attributes in this set.
+ @return the count of stored attributes (used slots), not the capacity of the backing arrays
+ */
+ public int size() {
+ return size;
+ }
+
+ /**
+ Copy every attribute of the incoming set into this one; values of keys already present are replaced.
+ @param incoming attributes to add to these attributes.
+ */
+ public void addAll(Attributes incoming) {
+     final int extra = incoming.size();
+     if (extra == 0)
+         return;
+     checkCapacity(size + extra); // reserve room up front for the worst case of all-new keys
+
+     for (Attribute attr : incoming) {
+         // todo - should this be case insensitive?
+         put(attr);
+     }
+ }
+
+ public Iterator<Attribute> iterator() { // walks the used slots in order; every next() builds a fresh Attribute bound to this
+     return new Iterator<Attribute>() {
+         int i = 0;
+
+         @Override
+         public boolean hasNext() {
+             return i < size;
+         }
+
+         @Override
+         public Attribute next() {
+             final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this);
+             i++;
+             return attr;
+         }
+
+         @Override
+         public void remove() {
+             Attributes.this.remove(--i); // next() advanced, so rewind
+         }
+     };
+ }
+
+ /**
+ Get the attributes as a List, for iteration.
+ @return an unmodifiable List holding a snapshot of the attributes, in order; later additions or removals are not reflected.
+ */
+ public List<Attribute> asList() {
+     ArrayList<Attribute> list = new ArrayList<>(size);
+     for (int i = 0; i < size; i++) {
+         // Each element is a fresh Attribute whose parent is this container, so calling
+         // setKey / setValue on a list element still writes through to these attributes.
+         // Boolean attributes (null value) are represented as plain Attribute instances too;
+         // no dedicated subclass is used for them.
+         list.add(new Attribute(keys[i], vals[i], Attributes.this));
+     }
+     return Collections.unmodifiableList(list);
+ }
+
+ /**
+ * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys
+ * starting with {@code data-}. Keys in the returned map have that prefix stripped; puts write through with it added.
+ * @return map of custom data attributes, backed by these attributes.
+ */
+ public Map<String, String> dataset() {
+     return new Dataset(this);
+ }
+
+// /**
+// Get the HTML representation of these attributes.
+// @return HTML
+// @throws SerializationException if the HTML representation of the attributes cannot be constructed.
+// */
+// public String html() {
+// StringBuilder accum = new StringBuilder();
+// try {
+// html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used
+// } catch (IOException e) { // ought never happen
+// throw new SerializationException(e);
+// }
+// return accum.toString();
+// }
+//
+// final void html(final Appendable accum, final Document.OutputSettings out) throws IOException {
+// final int sz = size;
+// for (int i = 0; i < sz; i++) {
+// // inlined from Attribute.html()
+// final String key = keys[i];
+// final String val = vals[i];
+// accum.append(' ').append(key);
+//
+// // collapse checked=null, checked="", checked=checked; write out others
+// if (!Attribute.shouldCollapseAttribute(key, val, out)) {
+// accum.append("=\"");
+// Entities.escape(accum, val == null ? EmptyString : val, out, true, false, false);
+// accum.append('"');
+// }
+// }
+// }
+//
+// @Override
+// public String toString() {
+// return html();
+// }
+
+ /**
+ * Checks if these attributes are equal to another set: same keys and values, in the same order. Spare array capacity is ignored.
+ * @param o attributes to compare with
+ * @return if both sets of attributes have the same content
+ */
+ @Override
+ public boolean equals(Object o) {
+     if (this == o) return true;
+     if (o == null || getClass() != o.getClass()) return false;
+
+     Attributes that = (Attributes) o;
+     // compare the used slots only: the backing arrays may carry different amounts of unused capacity
+     if (size != that.size) return false;
+     if (!Arrays.equals(copyOf(keys, size), copyOf(that.keys, size))) return false;
+     return Arrays.equals(copyOf(vals, size), copyOf(that.vals, size));
+ }
+
+ /**
+ * Calculates the hashcode of these attributes from the used key / value slots only, so that it agrees with equals.
+ * @return calculated hashcode
+ */
+ @Override
+ public int hashCode() {
+     int result = size;
+     result = 31 * result + Arrays.hashCode(copyOf(keys, size)); // trimmed copies keep unused capacity out of the hash
+     result = 31 * result + Arrays.hashCode(copyOf(vals, size));
+     return result;
+ }
+
+ @Override
+ public Attributes clone() { // the copy gets its own key / value arrays, trimmed to the used size
+     Attributes clone;
+     try {
+         clone = (Attributes) super.clone();
+     } catch (CloneNotSupportedException e) {
+         throw new RuntimeException(e);
+     }
+     clone.size = size;
+     clone.keys = copyOf(keys, size); // assign to the clone, leaving this instance's storage untouched
+     clone.vals = copyOf(vals, size);
+     return clone;
+ }
+
+ /**
+ * Internal method. Lowercases all keys in place; values are untouched and keys that now collide are not merged.
+ */
+ public void normalize() {
+ for (int i = 0; i < size; i++) {
+ keys[i] = lowerCase(keys[i]);
+ }
+ }
+
+ private static class Dataset extends AbstractMap<String, String> { // map view over the data- prefixed attributes
+     private final Attributes attributes;
+
+     private Dataset(Attributes attributes) {
+         this.attributes = attributes;
+     }
+
+     @Override
+     public Set<Entry<String, String>> entrySet() {
+         return new EntrySet();
+     }
+
+     @Override
+     public String put(String key, String value) {
+         String dataKey = dataKey(key);
+         String oldValue = attributes.hasKey(dataKey) ? attributes.get(dataKey) : null;
+         attributes.put(dataKey, value);
+         return oldValue;
+     }
+
+     private class EntrySet extends AbstractSet<Map.Entry<String, String>> {
+         // entries are computed on demand from the backing attributes; nothing is cached
+         @Override
+         public Iterator<Map.Entry<String, String>> iterator() {
+             return new DatasetIterator();
+         }
+
+         @Override
+         public int size() {
+             int count = 0;
+             Iterator<Map.Entry<String, String>> iter = new DatasetIterator();
+             while (iter.hasNext()) // every hasNext() call moves on to the next data attribute, so this counts them
+                 count++;
+             return count;
+         }
+     }
+
+     private class DatasetIterator implements Iterator<Map.Entry<String, String>> {
+         private Iterator<Attribute> attrIter = attributes.iterator();
+         private Attribute attr;
+         public boolean hasNext() { // not idempotent: scans forward and parks attr on the next data attribute
+             while (attrIter.hasNext()) {
+                 attr = attrIter.next();
+                 if (attr.isDataAttribute()) return true;
+             }
+             return false;
+         }
+
+         public Entry<String, String> next() { // the entry's key is the attribute key with the data prefix cut off
+             return new Attribute(attr.getKey().substring(dataPrefix.length()), attr.getValue());
+         }
+
+         public void remove() {
+             attrIter.remove(); // removes attr and rewinds the underlying cursor, so the following attribute is not skipped
+         }
+     }
+ }
+
+ private static String dataKey(String key) { // maps a dataset key to the full attribute key
+ return dataPrefix + key;
+ }
+}
\ No newline at end of file
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/DocumentType.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/DocumentType.java
new file mode 100644
index 00000000..dc11e537
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/DocumentType.java
@@ -0,0 +1,104 @@
+package ru.noties.markwon.html.jsoup.nodes;
+
+/**
+ * A {@code <!DOCTYPE>} node.
+ */
+public class DocumentType /*extends LeafNode*/ {
+ // todo needs a bit of a chunky cleanup. this level of detail isn't needed
+ public static final String PUBLIC_KEY = "PUBLIC"; // the only members still live here are these two constants
+ public static final String SYSTEM_KEY = "SYSTEM"; // everything below is commented-out node logic
+// private static final String NAME = "name";
+// private static final String PUB_SYS_KEY = "pubSysKey"; // PUBLIC or SYSTEM
+// private static final String PUBLIC_ID = "publicId";
+// private static final String SYSTEM_ID = "systemId";
+ // todo: quirk mode from publicId and systemId
+
+// /**
+// * Create a new doctype element.
+// * @param name the doctype's name
+// * @param publicId the doctype's public ID
+// * @param systemId the doctype's system ID
+// */
+// public DocumentType(String name, String publicId, String systemId) {
+// Validate.notNull(name);
+// Validate.notNull(publicId);
+// Validate.notNull(systemId);
+// attr(NAME, name);
+// attr(PUBLIC_ID, publicId);
+// if (has(PUBLIC_ID)) {
+// attr(PUB_SYS_KEY, PUBLIC_KEY);
+// }
+// attr(SYSTEM_ID, systemId);
+// }
+//
+// /**
+// * Create a new doctype element.
+// * @param name the doctype's name
+// * @param publicId the doctype's public ID
+// * @param systemId the doctype's system ID
+// * @param baseUri unused
+// * @deprecated
+// */
+// public DocumentType(String name, String publicId, String systemId, String baseUri) {
+// attr(NAME, name);
+// attr(PUBLIC_ID, publicId);
+// if (has(PUBLIC_ID)) {
+// attr(PUB_SYS_KEY, PUBLIC_KEY);
+// }
+// attr(SYSTEM_ID, systemId);
+// }
+//
+// /**
+// * Create a new doctype element.
+// * @param name the doctype's name
+// * @param publicId the doctype's public ID
+// * @param systemId the doctype's system ID
+// * @param baseUri unused
+// * @deprecated
+// */
+// public DocumentType(String name, String pubSysKey, String publicId, String systemId, String baseUri) {
+// attr(NAME, name);
+// if (pubSysKey != null) {
+// attr(PUB_SYS_KEY, pubSysKey);
+// }
+// attr(PUBLIC_ID, publicId);
+// attr(SYSTEM_ID, systemId);
+// }
+// public void setPubSysKey(String value) {
+// if (value != null)
+// attr(PUB_SYS_KEY, value);
+// }
+//
+// @Override
+// public String nodeName() {
+// return "#doctype";
+// }
+//
+// @Override
+// void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
+// if (out.syntax() == Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) {
+// // looks like a html5 doctype, go lowercase for aesthetics
+// accum.append("');
+// }
+//
+// @Override
+// void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {
+// }
+//
+// private boolean has(final String attribute) {
+// return !StringUtil.isBlank(attr(attribute));
+// }
+}
+
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Entities.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Entities.java
new file mode 100644
index 00000000..c6c8d829
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Entities.java
@@ -0,0 +1,351 @@
+package ru.noties.markwon.html.jsoup.nodes;
+
+import java.nio.charset.CharsetEncoder;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import ru.noties.markwon.html.jsoup.helper.Validate;
+import ru.noties.markwon.html.jsoup.parser.CharacterReader;
+
+import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.base;
+import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.extended;
+
+/**
+ * HTML entities, and escape routines. Source: W3C
+ * HTML named character references.
+ */
+public class Entities {
+ private static final int empty = -1; // sentinel: no codepoint for this name
+ private static final String emptyName = ""; // sentinel: no name for this codepoint
+ static final int codepointRadix = 36; // numbers in the packed entity data are written in base 36
+ private static final char[] codeDelims = {',', ';'};
+ private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
+// private static final Document.OutputSettings DefaultOutput = new Document.OutputSettings();
+
+ public enum EscapeMode {
+ /**
+ * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
+ */
+ xhtml(EntitiesData.xmlPoints, 4),
+ /**
+ * Default HTML output entities.
+ */
+ base(EntitiesData.basePoints, 106),
+ /**
+ * Complete HTML entities.
+ */
+ extended(EntitiesData.fullPoints, 2125);
+
+ // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
+ private String[] nameKeys;
+ private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
+
+ // table of codepoints to named entities.
+ private int[] codeKeys; // we don' support multicodepoints to single named value currently
+ private String[] nameVals;
+
+ EscapeMode(String file, int size) {
+ load(this, file, size);
+ }
+
+ int codepointForName(final String name) {
+ int index = Arrays.binarySearch(nameKeys, name);
+ return index >= 0 ? codeVals[index] : empty;
+ }
+
+ String nameForCodepoint(final int codepoint) {
+ final int index = Arrays.binarySearch(codeKeys, codepoint);
+ if (index >= 0) {
+ // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
+ // (and binary search for same item with multi results is undefined
+ return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
+ nameVals[index + 1] : nameVals[index];
+ }
+ return emptyName;
+ }
+
+ private int size() {
+ return nameKeys.length;
+ }
+ }
+
+ private Entities() { // private and empty: the class is used through its static members only
+ }
+
+ /**
+ * Check if the input is a known named entity, looked up by exact name in the extended entity table.
+ *
+ * @param name the possible entity name (e.g. "lt" or "amp")
+ * @return true if a known named entity
+ */
+ public static boolean isNamedEntity(final String name) {
+ return extended.codepointForName(name) != empty;
+ }
+
+ /**
+ * Check if the input is a known named entity in the base entity set, looked up by exact name.
+ *
+ * @param name the possible entity name (e.g. "lt" or "amp")
+ * @return true if a known named entity in the base set
+ * @see #isNamedEntity(String)
+ */
+ public static boolean isBaseNamedEntity(final String name) {
+ return base.codepointForName(name) != empty;
+ }
+
+ /**
+ * Get the Character value of the named entity
+ *
+ * @param name named entity (e.g. "lt" or "amp")
+ * @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}'); never null - an unknown name yields U+FFFF
+ * @deprecated does not support characters outside the BMP or multiple character names
+ */
+ public static Character getCharacterByName(String name) {
+ return (char) extended.codepointForName(name); // the -1 "not found" sentinel is narrowed by this cast as well
+ }
+
+ /**
+ * Resolve a named entity to the character(s) it stands for.
+ *
+ * @param name entity (e.g. "lt" or "amp")
+ * @return the string value of the character(s) represented by this entity, or "" if not defined
+ */
+ public static String getByName(String name) {
+     final String multi = multipoints.get(name);
+     if (multi != null)
+         return multi;
+     // not a multi-character reference: fall back to the single-codepoint table
+     final int codepoint = extended.codepointForName(name);
+     // the int[] constructor builds the right string for codepoints outside the BMP too
+     return codepoint == empty ? emptyName : new String(new int[]{codepoint}, 0, 1);
+ }
+
+ public static int codepointsForName(final String name, final int[] codepoints) { // fills codepoints; returns how many were written (0, 1 or 2)
+     String val = multipoints.get(name);
+     if (val != null) {
+         codepoints[0] = val.codePointAt(0);
+         codepoints[1] = val.codePointAt(Character.charCount(codepoints[0])); // step over a surrogate pair if the first is supplementary
+         return 2;
+     }
+     int codepoint = extended.codepointForName(name);
+     if (codepoint != empty) {
+         codepoints[0] = codepoint;
+         return 1;
+     }
+     return 0;
+ }
+
+// /**
+// * HTML escape an input string. That is, {@code <} is returned as {@code <}
+// *
+// * @param string the un-escaped string to escape
+// * @param out the output settings to use
+// * @return the escaped string
+// */
+// public static String escape(String string, Document.OutputSettings out) {
+// if (string == null)
+// return "";
+// StringBuilder accum = new StringBuilder(string.length() * 2);
+// try {
+// escape(accum, string, out, false, false, false);
+// } catch (IOException e) {
+// throw new SerializationException(e); // doesn't happen
+// }
+// return accum.toString();
+// }
+
+// /**
+// * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
+// * {@code <}
+// *
+// * @param string the un-escaped string to escape
+// * @return the escaped string
+// */
+// public static String escape(String string) {
+// return escape(string, DefaultOutput);
+// }
+//
+// // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
+// static void escape(Appendable accum, String string, Document.OutputSettings out,
+// boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
+//
+// boolean lastWasWhite = false;
+// boolean reachedNonWhite = false;
+// final EscapeMode escapeMode = out.escapeMode();
+// final CharsetEncoder encoder = out.encoder();
+// final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
+// final int length = string.length();
+//
+// int codePoint;
+// for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
+// codePoint = string.codePointAt(offset);
+//
+// if (normaliseWhite) {
+// if (StringUtil.isWhitespace(codePoint)) {
+// if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
+// continue;
+// accum.append(' ');
+// lastWasWhite = true;
+// continue;
+// } else {
+// lastWasWhite = false;
+// reachedNonWhite = true;
+// }
+// }
+// // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
+// if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+// final char c = (char) codePoint;
+// // html specific and required escapes:
+// switch (c) {
+// case '&':
+// accum.append("&");
+// break;
+// case 0xA0:
+// if (escapeMode != EscapeMode.xhtml)
+// accum.append(" ");
+// else
+// accum.append(" ");
+// break;
+// case '<':
+// // escape when in character data or when in a xml attribue val; not needed in html attr val
+// if (!inAttribute || escapeMode == EscapeMode.xhtml)
+// accum.append("<");
+// else
+// accum.append(c);
+// break;
+// case '>':
+// if (!inAttribute)
+// accum.append(">");
+// else
+// accum.append(c);
+// break;
+// case '"':
+// if (inAttribute)
+// accum.append(""");
+// else
+// accum.append(c);
+// break;
+// default:
+// if (canEncode(coreCharset, c, encoder))
+// accum.append(c);
+// else
+// appendEncoded(accum, escapeMode, codePoint);
+// }
+// } else {
+// final String c = new String(Character.toChars(codePoint));
+// if (encoder.canEncode(c)) // uses fallback encoder for simplicity
+// accum.append(c);
+// else
+// appendEncoded(accum, escapeMode, codePoint);
+// }
+// }
+// }
+
+// private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
+// final String name = escapeMode.nameForCodepoint(codePoint);
+// if (name != emptyName) // ok for identity check
+// accum.append('&').append(name).append(';');
+// else
+// accum.append("").append(Integer.toHexString(codePoint)).append(';');
+// }
+
+// /**
+// * Un-escape an HTML escaped string. That is, {@code <} is returned as {@code <}.
+// *
+// * @param string the HTML string to un-escape
+// * @return the unescaped string
+// */
+// public static String unescape(String string) {
+// return unescape(string, false);
+// }
+
+// /**
+// * Unescape the input string.
+// *
+// * @param string to un-HTML-escape
+// * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
+// * @return unescaped string
+// */
+// static String unescape(String string, boolean strict) {
+// return Parser.unescapeEntities(string, strict);
+// }
+
+ /*
+ * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
+ * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
+ * performance may be bad. We can add more encoders for common character sets that are impacted by performance
+ * issues on Android if required.
+ *
+ * Benchmarks: *
+ * OLD toHtml() impl v New (fastpath) in millis
+ * Wiki: 1895, 16
+ * CNN: 6378, 55
+ * Alterslash: 3013, 28
+ * Jsoup: 167, 2
+ */
+ private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
+ // todo add more charset tests if impacted by Android's bad perf in canEncode
+ switch (charset) {
+ case ascii:
+ return c < 0x80; // fast path: only the 7-bit range is accepted
+ case utf:
+ return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
+ default:
+ return fallback.canEncode(c); // any other charset: ask the supplied encoder
+ }
+ }
+
+ enum CoreCharset {
+     ascii, utf, fallback;
+
+     // Classifies a charset name: exactly "US-ASCII", any name starting with "UTF-", or everything else.
+     static CoreCharset byName(final String name) {
+         if (name.equals("US-ASCII")) {
+             return ascii;
+         }
+         return name.startsWith("UTF-") ? utf : fallback; // covers UTF-8, UTF-16, et al
+     }
+ }
+
+ private static void load(EscapeMode e, String pointsData, int size) { // fills e's lookup tables from packed entity data
+ e.nameKeys = new String[size];
+ e.codeVals = new int[size];
+ e.codeKeys = new int[size];
+ e.nameVals = new String[size];
+
+ int i = 0;
+ CharacterReader reader = new CharacterReader(pointsData);
+
+ while (!reader.isEmpty()) {
+ // NotNestedLessLess=10913,824;1887&
+ // record shape: name '=' codepoint [',' second codepoint] ';' index '&' - every number is parsed in base 36
+ final String name = reader.consumeTo('=');
+ reader.advance();
+ final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
+ final char codeDelim = reader.current();
+ reader.advance();
+ final int cp2;
+ if (codeDelim == ',') {
+ cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
+ reader.advance();
+ } else {
+ cp2 = empty;
+ }
+ final String indexS = reader.consumeTo('&');
+ final int index = Integer.parseInt(indexS, codepointRadix);
+ reader.advance();
+ // slot i follows the record order (presumably sorted by name); index is the slot in the codepoint-keyed tables
+ e.nameKeys[i] = name;
+ e.codeVals[i] = cp1;
+ e.codeKeys[index] = cp1;
+ e.nameVals[index] = name;
+
+ if (cp2 != empty) {
+ multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
+ }
+ i++;
+ }
+ // the declared size must match the number of records actually read
+ Validate.isTrue(i == size, "Unexpected count of entities loaded");
+ }
+}
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/EntitiesData.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/EntitiesData.java
new file mode 100644
index 00000000..036c712f
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/EntitiesData.java
@@ -0,0 +1,11 @@
+package ru.noties.markwon.html.jsoup.nodes;
+
+/**
+ * Holds packed data that represents Entity name=value pairs, stored as "name=codepoint[,codepoint2];index&" records. Parsed by Entities, created by BuildEntities.
+ */
+class EntitiesData {
+    static final String xmlPoints = "amp=12;1&gt=1q;3&lt=1o;2&quot=y;0&"; // records: name=codepoint;index& (entity names must stay spelled out, not decoded)
+ static final String basePoints = "AElig=5i;1c&=12;2Á=5d;17Â=5e;18À=5c;16Å=5h;1bÃ=5f;19Ä=5g;1a©=4p;hÇ=5j;1dÐ=5s;1mÉ=5l;1fÊ=5m;1gÈ=5k;1eË=5n;1h>=1q;6Í=5p;1jÎ=5q;1kÌ=5o;1iÏ=5r;1l<=1o;4Ñ=5t;1nÓ=5v;1pÔ=5w;1qÒ=5u;1oØ=60;1uÕ=5x;1rÖ=5y;1s"=y;0®=4u;nÞ=66;20Ú=62;1wÛ=63;1xÙ=61;1vÜ=64;1yÝ=65;1zá=69;23â=6a;24´=50;uæ=6e;28à=68;22&=12;3å=6d;27ã=6b;25ä=6c;26¦=4m;eç=6f;29¸=54;y¢=4i;a©=4p;i¤=4k;c°=4w;q÷=6v;2pé=6h;2bê=6i;2cè=6g;2að=6o;2ië=6j;2d½=59;13¼=58;12¾=5a;14>=1q;7í=6l;2fî=6m;2g¡=4h;9ì=6k;2e¿=5b;15ï=6n;2h«=4r;k<=1o;5¯=4v;pµ=51;v·=53;x =4g;8¬=4s;lñ=6p;2jó=6r;2lô=6s;2mò=6q;2kª=4q;jº=56;10ø=6w;2qõ=6t;2nö=6u;2o¶=52;w±=4x;r£=4j;b"=y;1»=57;11®=4u;o§=4n;f­=4t;m¹=55;z²=4y;s³=4z;tß=67;21þ=72;2w×=5z;1tú=6y;2sû=6z;2tù=6x;2r¨=4o;gü=70;2uý=71;2v¥=4l;dÿ=73;2x&";
+ static final String fullPoints = "AElig=5i;2v&=12;8Á=5d;2p&Abreve=76;4kÂ=5e;2q&Acy=sw;av&Afr=2kn8;1khÀ=5c;2o&Alpha=pd;8d&Amacr=74;4i&And=8cz;1e1&Aogon=78;4m&Aopf=2koo;1ls&ApplyFunction=6e9;ewÅ=5h;2t&Ascr=2kkc;1jc&Assign=6s4;s6Ã=5f;2rÄ=5g;2s&Backslash=6qe;o1&Barv=8h3;1it&Barwed=6x2;120&Bcy=sx;aw&Because=6r9;pw&Bernoullis=6jw;gn&Beta=pe;8e&Bfr=2kn9;1ki&Bopf=2kop;1lt&Breve=k8;82&Bscr=6jw;gp&Bumpeq=6ry;ro&CHcy=tj;bi©=4p;1q&Cacute=7a;4o&Cap=6vm;zz&CapitalDifferentialD=6kl;h8&Cayleys=6jx;gq&Ccaron=7g;4uÇ=5j;2w&Ccirc=7c;4q&Cconint=6r4;pn&Cdot=7e;4s&Cedilla=54;2e&CenterDot=53;2b&Cfr=6jx;gr&Chi=pz;8y&CircleDot=6u1;x8&CircleMinus=6ty;x3&CirclePlus=6tx;x1&CircleTimes=6tz;x5&ClockwiseContourIntegral=6r6;pp&CloseCurlyDoubleQuote=6cd;e0&CloseCurlyQuote=6c9;dt&Colon=6rb;q1&Colone=8dw;1en&Congruent=6sh;sn&Conint=6r3;pm&ContourIntegral=6r2;pi&Copf=6iq;f7&Coproduct=6q8;nq&CounterClockwiseContourIntegral=6r7;pr&Cross=8bz;1d8&Cscr=2kke;1jd&Cup=6vn;100&CupCap=6rx;rk&DD=6kl;h9&DDotrahd=841;184&DJcy=si;ai&DScy=sl;al&DZcy=sv;au&Dagger=6ch;e7&Darr=6n5;j5&Dashv=8h0;1ir&Dcaron=7i;4w&Dcy=t0;az&Del=6pz;n9&Delta=pg;8g&Dfr=2knb;1kj&DiacriticalAcute=50;27&DiacriticalDot=k9;84&DiacriticalDoubleAcute=kd;8a&DiacriticalGrave=2o;13&DiacriticalTilde=kc;88&Diamond=6v8;za&DifferentialD=6km;ha&Dopf=2kor;1lu&Dot=4o;1n&DotDot=6ho;f5&DotEqual=6s0;rw&DoubleContourIntegral=6r3;pl&DoubleDot=4o;1m&DoubleDownArrow=6oj;m0&DoubleLeftArrow=6og;lq&DoubleLeftRightArrow=6ok;m3&DoubleLeftTee=8h0;1iq&DoubleLongLeftArrow=7w8;17g&DoubleLongLeftRightArrow=7wa;17m&DoubleLongRightArrow=7w9;17j&DoubleRightArrow=6oi;lw&DoubleRightTee=6ug;xz&DoubleUpArrow=6oh;lt&DoubleUpDownArrow=6ol;m7&DoubleVerticalBar=6qt;ov&DownArrow=6mr;i8&DownArrowBar=843;186&DownArrowUpArrow=6ph;mn&DownBreve=lt;8c&DownLeftRightVector=85s;198&DownLeftTeeVector=866;19m&DownLeftVector=6nx;ke&DownLeftVectorBar=85y;19e&DownRightTeeVector=867;19n&DownRightVector=6o1;kq&DownRightVectorBar=85z;19f&DownTee=6uc;xs&DownTeeArrow=6nb;jh&Downarrow=6oj;m1&Dscr=2kkf;1je
&Dstrok=7k;4y&ENG=96;6gÐ=5s;35É=5l;2y&Ecaron=7u;56Ê=5m;2z&Ecy=tp;bo&Edot=7q;52&Efr=2knc;1kkÈ=5k;2x&Element=6q0;na&Emacr=7m;50&EmptySmallSquare=7i3;15x&EmptyVerySmallSquare=7fv;150&Eogon=7s;54&Eopf=2kos;1lv&Epsilon=ph;8h&Equal=8dx;1eo&EqualTilde=6rm;qp&Equilibrium=6oc;li&Escr=6k0;gu&Esim=8dv;1em&Eta=pj;8jË=5n;30&Exists=6pv;mz&ExponentialE=6kn;hc&Fcy=tg;bf&Ffr=2knd;1kl&FilledSmallSquare=7i4;15y&FilledVerySmallSquare=7fu;14w&Fopf=2kot;1lw&ForAll=6ps;ms&Fouriertrf=6k1;gv&Fscr=6k1;gw&GJcy=sj;aj>=1q;r&Gamma=pf;8f&Gammad=rg;a5&Gbreve=7y;5a&Gcedil=82;5e&Gcirc=7w;58&Gcy=sz;ay&Gdot=80;5c&Gfr=2kne;1km&Gg=6vt;10c&Gopf=2kou;1lx&GreaterEqual=6sl;sv&GreaterEqualLess=6vv;10i&GreaterFullEqual=6sn;t6&GreaterGreater=8f6;1gh&GreaterLess=6t3;ul&GreaterSlantEqual=8e6;1f5&GreaterTilde=6sz;ub&Gscr=2kki;1jf&Gt=6sr;tr&HARDcy=tm;bl&Hacek=jr;80&Hat=2m;10&Hcirc=84;5f&Hfr=6j0;fe&HilbertSpace=6iz;fa&Hopf=6j1;fg&HorizontalLine=7b4;13i&Hscr=6iz;fc&Hstrok=86;5h&HumpDownHump=6ry;rn&HumpEqual=6rz;rs&IEcy=t1;b0&IJlig=8i;5s&IOcy=sh;ahÍ=5p;32Î=5q;33&Icy=t4;b3&Idot=8g;5p&Ifr=6j5;fqÌ=5o;31&Im=6j5;fr&Imacr=8a;5l&ImaginaryI=6ko;hf&Implies=6oi;ly&Int=6r0;pf&Integral=6qz;pd&Intersection=6v6;z4&InvisibleComma=6eb;f0&InvisibleTimes=6ea;ey&Iogon=8e;5n&Iopf=2kow;1ly&Iota=pl;8l&Iscr=6j4;fn&Itilde=88;5j&Iukcy=sm;amÏ=5r;34&Jcirc=8k;5u&Jcy=t5;b4&Jfr=2knh;1kn&Jopf=2kox;1lz&Jscr=2kkl;1jg&Jsercy=so;ao&Jukcy=sk;ak&KHcy=th;bg&KJcy=ss;as&Kappa=pm;8m&Kcedil=8m;5w&Kcy=t6;b5&Kfr=2kni;1ko&Kopf=2koy;1m0&Kscr=2kkm;1jh&LJcy=sp;ap<=1o;m&Lacute=8p;5z&Lambda=pn;8n&Lang=7vu;173&Laplacetrf=6j6;fs&Larr=6n2;j1&Lcaron=8t;63&Lcedil=8r;61&Lcy=t7;b6&LeftAngleBracket=7vs;16x&LeftArrow=6mo;hu&LeftArrowBar=6p0;mj&LeftArrowRightArrow=6o6;l3&LeftCeiling=6x4;121&LeftDoubleBracket=7vq;16t&LeftDownTeeVector=869;19p&LeftDownVector=6o3;kw&LeftDownVectorBar=861;19h&LeftFloor=6x6;125&LeftRightArrow=6ms;ib&LeftRightVector=85q;196&LeftTee=6ub;xq&LeftTeeArrow=6n8;ja&LeftTeeVector=862;19i&LeftTriangle=6uq;ya&LeftTriangleBar=89b;1c0&LeftTriangleEqual=6us;yg&
LeftUpDownVector=85t;199&LeftUpTeeVector=868;19o&LeftUpVector=6nz;kk&LeftUpVectorBar=860;19g&LeftVector=6nw;kb&LeftVectorBar=85u;19a&Leftarrow=6og;lr&Leftrightarrow=6ok;m4&LessEqualGreater=6vu;10e&LessFullEqual=6sm;t0&LessGreater=6t2;ui&LessLess=8f5;1gf&LessSlantEqual=8e5;1ez&LessTilde=6sy;u8&Lfr=2knj;1kp&Ll=6vs;109&Lleftarrow=6oq;me&Lmidot=8v;65&LongLeftArrow=7w5;177&LongLeftRightArrow=7w7;17d&LongRightArrow=7w6;17a&Longleftarrow=7w8;17h&Longleftrightarrow=7wa;17n&Longrightarrow=7w9;17k&Lopf=2koz;1m1&LowerLeftArrow=6mx;iq&LowerRightArrow=6mw;in&Lscr=6j6;fu&Lsh=6nk;jv&Lstrok=8x;67&Lt=6sq;tl&Map=83p;17v&Mcy=t8;b7&MediumSpace=6e7;eu&Mellintrf=6k3;gx&Mfr=2knk;1kq&MinusPlus=6qb;nv&Mopf=2kp0;1m2&Mscr=6k3;gz&Mu=po;8o&NJcy=sq;aq&Nacute=8z;69&Ncaron=93;6d&Ncedil=91;6b&Ncy=t9;b8&NegativeMediumSpace=6bv;dc&NegativeThickSpace=6bv;dd&NegativeThinSpace=6bv;de&NegativeVeryThinSpace=6bv;db&NestedGreaterGreater=6sr;tq&NestedLessLess=6sq;tk&NewLine=a;1&Nfr=2knl;1kr&NoBreak=6e8;ev&NonBreakingSpace=4g;1d&Nopf=6j9;fx&Not=8h8;1ix&NotCongruent=6si;sp&NotCupCap=6st;tv&NotDoubleVerticalBar=6qu;p0&NotElement=6q1;ne&NotEqual=6sg;sk&NotEqualTilde=6rm,mw;qn&NotExists=6pw;n1&NotGreater=6sv;tz&NotGreaterEqual=6sx;u5&NotGreaterFullEqual=6sn,mw;t3&NotGreaterGreater=6sr,mw;tn&NotGreaterLess=6t5;uq&NotGreaterSlantEqual=8e6,mw;1f2&NotGreaterTilde=6t1;ug&NotHumpDownHump=6ry,mw;rl&NotHumpEqual=6rz,mw;rq&NotLeftTriangle=6wa;113&NotLeftTriangleBar=89b,mw;1bz&NotLeftTriangleEqual=6wc;119&NotLess=6su;tw&NotLessEqual=6sw;u2&NotLessGreater=6t4;uo&NotLessLess=6sq,mw;th&NotLessSlantEqual=8e5,mw;1ew&NotLessTilde=6t0;ue&NotNestedGreaterGreater=8f6,mw;1gg&NotNestedLessLess=8f5,mw;1ge&NotPrecedes=6tc;vb&NotPrecedesEqual=8fj,mw;1gv&NotPrecedesSlantEqual=6w0;10p&NotReverseElement=6q4;nl&NotRightTriangle=6wb;116&NotRightTriangleBar=89c,mw;1c1&NotRightTriangleEqual=6wd;11c&NotSquareSubset=6tr,mw;wh&NotSquareSubsetEqual=6w2;10t&NotSquareSuperset=6ts,mw;wl&NotSquareSupersetEqual=6w3;10v&NotSubset=6te,6he;vh&NotSubsetEqu
al=6tk;w0&NotSucceeds=6td;ve&NotSucceedsEqual=8fk,mw;1h1&NotSucceedsSlantEqual=6w1;10r&NotSucceedsTilde=6tb,mw;v7&NotSuperset=6tf,6he;vm&NotSupersetEqual=6tl;w3&NotTilde=6rl;ql&NotTildeEqual=6ro;qv&NotTildeFullEqual=6rr;r1&NotTildeTilde=6rt;r9&NotVerticalBar=6qs;or&Nscr=2kkp;1jiÑ=5t;36&Nu=pp;8p&OElig=9e;6mÓ=5v;38Ô=5w;39&Ocy=ta;b9&Odblac=9c;6k&Ofr=2knm;1ksÒ=5u;37&Omacr=98;6i&Omega=q1;90&Omicron=pr;8r&Oopf=2kp2;1m3&OpenCurlyDoubleQuote=6cc;dy&OpenCurlyQuote=6c8;dr&Or=8d0;1e2&Oscr=2kkq;1jjØ=60;3dÕ=5x;3a&Otimes=8c7;1dfÖ=5y;3b&OverBar=6da;em&OverBrace=732;13b&OverBracket=71w;134&OverParenthesis=730;139&PartialD=6pu;mx&Pcy=tb;ba&Pfr=2knn;1kt&Phi=py;8x&Pi=ps;8s&PlusMinus=4x;22&Poincareplane=6j0;fd&Popf=6jd;g3&Pr=8fv;1hl&Precedes=6t6;us&PrecedesEqual=8fj;1gy&PrecedesSlantEqual=6t8;uy&PrecedesTilde=6ta;v4&Prime=6cz;eg&Product=6q7;no&Proportion=6rb;q0&Proportional=6ql;oa&Pscr=2kkr;1jk&Psi=q0;8z"=y;3&Qfr=2kno;1ku&Qopf=6je;g5&Qscr=2kks;1jl&RBarr=840;183®=4u;1x&Racute=9g;6o&Rang=7vv;174&Rarr=6n4;j4&Rarrtl=846;187&Rcaron=9k;6s&Rcedil=9i;6q&Rcy=tc;bb&Re=6jg;gb&ReverseElement=6q3;nh&ReverseEquilibrium=6ob;le&ReverseUpEquilibrium=86n;1a4&Rfr=6jg;ga&Rho=pt;8t&RightAngleBracket=7vt;170&RightArrow=6mq;i3&RightArrowBar=6p1;ml&RightArrowLeftArrow=6o4;ky&RightCeiling=6x5;123&RightDoubleBracket=7vr;16v&RightDownTeeVector=865;19l&RightDownVector=6o2;kt&RightDownVectorBar=85x;19d&RightFloor=6x7;127&RightTee=6ua;xo&RightTeeArrow=6na;je&RightTeeVector=863;19j&RightTriangle=6ur;yd&RightTriangleBar=89c;1c2&RightTriangleEqual=6ut;yk&RightUpDownVector=85r;197&RightUpTeeVector=864;19k&RightUpVector=6ny;kh&RightUpVectorBar=85w;19c&RightVector=6o0;kn&RightVectorBar=85v;19b&Rightarrow=6oi;lx&Ropf=6jh;gd&RoundImplies=86o;1a6&Rrightarrow=6or;mg&Rscr=6jf;g7&Rsh=6nl;jx&RuleDelayed=8ac;1cb&SHCHcy=tl;bk&SHcy=tk;bj&SOFTcy=to;bn&Sacute=9m;6u&Sc=8fw;1hm&Scaron=9s;70&Scedil=9q;6y&Scirc=9o;6w&Scy=td;bc&Sfr=2knq;1kv&ShortDownArrow=6mr;i7&ShortLeftArrow=6mo;ht&ShortRightArrow=6mq;i2&ShortUpArrow=6mp;hy&Sigma=pv;8u
&SmallCircle=6qg;o6&Sopf=2kp6;1m4&Sqrt=6qi;o9&Square=7fl;14t&SquareIntersection=6tv;ww&SquareSubset=6tr;wi&SquareSubsetEqual=6tt;wp&SquareSuperset=6ts;wm&SquareSupersetEqual=6tu;ws&SquareUnion=6tw;wz&Sscr=2kku;1jm&Star=6va;zf&Sub=6vk;zw&Subset=6vk;zv&SubsetEqual=6ti;vu&Succeeds=6t7;uv&SucceedsEqual=8fk;1h4&SucceedsSlantEqual=6t9;v1&SucceedsTilde=6tb;v8&SuchThat=6q3;ni&Sum=6q9;ns&Sup=6vl;zy&Superset=6tf;vp&SupersetEqual=6tj;vx&Supset=6vl;zxÞ=66;3j&TRADE=6jm;gf&TSHcy=sr;ar&TScy=ti;bh&Tab=9;0&Tau=pw;8v&Tcaron=9w;74&Tcedil=9u;72&Tcy=te;bd&Tfr=2knr;1kw&Therefore=6r8;pt&Theta=pk;8k&ThickSpace=6e7,6bu;et&ThinSpace=6bt;d7&Tilde=6rg;q9&TildeEqual=6rn;qs&TildeFullEqual=6rp;qy&TildeTilde=6rs;r4&Topf=2kp7;1m5&TripleDot=6hn;f3&Tscr=2kkv;1jn&Tstrok=9y;76Ú=62;3f&Uarr=6n3;j2&Uarrocir=85l;193&Ubrcy=su;at&Ubreve=a4;7cÛ=63;3g&Ucy=tf;be&Udblac=a8;7g&Ufr=2kns;1kxÙ=61;3e&Umacr=a2;7a&UnderBar=2n;11&UnderBrace=733;13c&UnderBracket=71x;136&UnderParenthesis=731;13a&Union=6v7;z8&UnionPlus=6tq;wf&Uogon=aa;7i&Uopf=2kp8;1m6&UpArrow=6mp;hz&UpArrowBar=842;185&UpArrowDownArrow=6o5;l1&UpDownArrow=6mt;ie&UpEquilibrium=86m;1a2&UpTee=6ud;xv&UpTeeArrow=6n9;jc&Uparrow=6oh;lu&Updownarrow=6ol;m8&UpperLeftArrow=6mu;ih&UpperRightArrow=6mv;ik&Upsi=r6;9z&Upsilon=px;8w&Uring=a6;7e&Uscr=2kkw;1jo&Utilde=a0;78Ü=64;3h&VDash=6uj;y3&Vbar=8h7;1iw&Vcy=sy;ax&Vdash=6uh;y1&Vdashl=8h2;1is&Vee=6v5;z3&Verbar=6c6;dp&Vert=6c6;dq&VerticalBar=6qr;on&VerticalLine=3g;18&VerticalSeparator=7rs;16o&VerticalTilde=6rk;qi&VeryThinSpace=6bu;d9&Vfr=2knt;1ky&Vopf=2kp9;1m7&Vscr=2kkx;1jp&Vvdash=6ui;y2&Wcirc=ac;7k&Wedge=6v4;z0&Wfr=2knu;1kz&Wopf=2kpa;1m8&Wscr=2kky;1jq&Xfr=2knv;1l0&Xi=pq;8q&Xopf=2kpb;1m9&Xscr=2kkz;1jr&YAcy=tr;bq&YIcy=sn;an&YUcy=tq;bpÝ=65;3i&Ycirc=ae;7m&Ycy=tn;bm&Yfr=2knw;1l1&Yopf=2kpc;1ma&Yscr=2kl0;1js&Yuml=ag;7o&ZHcy=t2;b1&Zacute=ah;7p&Zcaron=al;7t&Zcy=t3;b2&Zdot=aj;7r&ZeroWidthSpace=6bv;df&Zeta=pi;8i&Zfr=6js;gl&Zopf=6jo;gi&Zscr=2kl1;1jtá=69;3m&abreve=77;4l&ac=6ri;qg&acE=6ri,mr;qe&acd=6rj;qhâ=6a;3n´=50;28&acy=ts;bræ=6e;3r&af=6
e9;ex&afr=2kny;1l2à=68;3l&alefsym=6k5;h3&aleph=6k5;h4&alpha=q9;92&amacr=75;4j&amalg=8cf;1dm&=12;9&and=6qv;p6&andand=8d1;1e3&andd=8d8;1e9&andslope=8d4;1e6&andv=8d6;1e7&ang=6qo;oj&ange=884;1b1&angle=6qo;oi&angmsd=6qp;ol&angmsdaa=888;1b5&angmsdab=889;1b6&angmsdac=88a;1b7&angmsdad=88b;1b8&angmsdae=88c;1b9&angmsdaf=88d;1ba&angmsdag=88e;1bb&angmsdah=88f;1bc&angrt=6qn;og&angrtvb=6v2;yw&angrtvbd=87x;1b0&angsph=6qq;om&angst=5h;2u&angzarr=70c;12z&aogon=79;4n&aopf=2kpe;1mb&ap=6rs;r8&apE=8ds;1ej&apacir=8dr;1eh&ape=6ru;rd&apid=6rv;rf&apos=13;a&approx=6rs;r5&approxeq=6ru;rcå=6d;3q&ascr=2kl2;1ju&ast=16;e&asymp=6rs;r6&asympeq=6rx;rjã=6b;3oä=6c;3p&awconint=6r7;ps&awint=8b5;1cr&bNot=8h9;1iy&backcong=6rw;rg&backepsilon=s6;af&backprime=6d1;ei&backsim=6rh;qc&backsimeq=6vh;zp&barvee=6v1;yv&barwed=6x1;11y&barwedge=6x1;11x&bbrk=71x;137&bbrktbrk=71y;138&bcong=6rw;rh&bcy=tt;bs&bdquo=6ce;e4&becaus=6r9;py&because=6r9;px&bemptyv=88g;1bd&bepsi=s6;ag&bernou=6jw;go&beta=qa;93&beth=6k6;h5&between=6ss;tt&bfr=2knz;1l3&bigcap=6v6;z5&bigcirc=7hr;15s&bigcup=6v7;z7&bigodot=8ao;1cd&bigoplus=8ap;1cf&bigotimes=8aq;1ch&bigsqcup=8au;1cl&bigstar=7id;15z&bigtriangledown=7gd;15e&bigtriangleup=7g3;154&biguplus=8as;1cj&bigvee=6v5;z1&bigwedge=6v4;yy&bkarow=83x;17x&blacklozenge=8a3;1c9&blacksquare=7fu;14x&blacktriangle=7g4;156&blacktriangledown=7ge;15g&blacktriangleleft=7gi;15k&blacktriangleright=7g8;15a&blank=74z;13f&blk12=7f6;14r&blk14=7f5;14q&blk34=7f7;14s&block=7ew;14p&bne=1p,6hx;o&bnequiv=6sh,6hx;sm&bnot=6xc;12d&bopf=2kpf;1mc&bot=6ud;xx&bottom=6ud;xu&bowtie=6vc;zi&boxDL=7dj;141&boxDR=7dg;13y&boxDl=7di;140&boxDr=7df;13x&boxH=7dc;13u&boxHD=7dy;14g&boxHU=7e1;14j&boxHd=7dw;14e&boxHu=7dz;14h&boxUL=7dp;147&boxUR=7dm;144&boxUl=7do;146&boxUr=7dl;143&boxV=7dd;13v&boxVH=7e4;14m&boxVL=7dv;14d&boxVR=7ds;14a&boxVh=7e3;14l&boxVl=7du;14c&boxVr=7dr;149&boxbox=895;1bw&boxdL=7dh;13z&boxdR=7de;13w&boxdl=7bk;13m&boxdr=7bg;13l&boxh=7b4;13j&boxhD=7dx;14f&boxhU=7e0;14i&boxhd=7cc;13r&boxhu=7ck;13s&boxminus=6u7;xi&boxplus=6u6;xg&boxtim
es=6u8;xk&boxuL=7dn;145&boxuR=7dk;142&boxul=7bs;13o&boxur=7bo;13n&boxv=7b6;13k&boxvH=7e2;14k&boxvL=7dt;14b&boxvR=7dq;148&boxvh=7cs;13t&boxvl=7c4;13q&boxvr=7bw;13p&bprime=6d1;ej&breve=k8;83¦=4m;1k&bscr=2kl3;1jv&bsemi=6dr;er&bsim=6rh;qd&bsime=6vh;zq&bsol=2k;x&bsolb=891;1bv&bsolhsub=7uw;16r&bull=6ci;e9&bullet=6ci;e8&bump=6ry;rp&bumpE=8fi;1gu&bumpe=6rz;ru&bumpeq=6rz;rt&cacute=7b;4p&cap=6qx;pa&capand=8ck;1dq&capbrcup=8cp;1dv&capcap=8cr;1dx&capcup=8cn;1dt&capdot=8cg;1dn&caps=6qx,1e68;p9&caret=6dd;eo&caron=jr;81&ccaps=8ct;1dz&ccaron=7h;4vç=6f;3s&ccirc=7d;4r&ccups=8cs;1dy&ccupssm=8cw;1e0&cdot=7f;4t¸=54;2f&cemptyv=88i;1bf¢=4i;1g¢erdot=53;2c&cfr=2ko0;1l4&chcy=uf;ce&check=7pv;16j&checkmark=7pv;16i&chi=qv;9s&cir=7gr;15q&cirE=88z;1bt&circ=jq;7z&circeq=6s7;sc&circlearrowleft=6nu;k6&circlearrowright=6nv;k8&circledR=4u;1w&circledS=79k;13g&circledast=6u3;xc&circledcirc=6u2;xa&circleddash=6u5;xe&cire=6s7;sd&cirfnint=8b4;1cq&cirmid=8hb;1j0&cirscir=88y;1bs&clubs=7kz;168&clubsuit=7kz;167&colon=1m;j&colone=6s4;s7&coloneq=6s4;s5&comma=18;g&commat=1s;u&comp=6pt;mv&compfn=6qg;o7&complement=6pt;mu&complexes=6iq;f6&cong=6rp;qz&congdot=8dp;1ef&conint=6r2;pj&copf=2kpg;1md&coprod=6q8;nr©=4p;1r©sr=6jb;fz&crarr=6np;k1&cross=7pz;16k&cscr=2kl4;1jw&csub=8gf;1id&csube=8gh;1if&csup=8gg;1ie&csupe=8gi;1ig&ctdot=6wf;11g&cudarrl=854;18x&cudarrr=851;18u&cuepr=6vy;10m&cuesc=6vz;10o&cularr=6nq;k3&cularrp=859;190&cup=6qy;pc&cupbrcap=8co;1du&cupcap=8cm;1ds&cupcup=8cq;1dw&cupdot=6tp;we&cupor=8cl;1dr&cups=6qy,1e68;pb&curarr=6nr;k5&curarrm=858;18z&curlyeqprec=6vy;10l&curlyeqsucc=6vz;10n&curlyvee=6vi;zr&curlywedge=6vj;zt¤=4k;1i&curvearrowleft=6nq;k2&curvearrowright=6nr;k4&cuvee=6vi;zs&cuwed=6vj;zu&cwconint=6r6;pq&cwint=6r5;po&cylcty=6y5;12u&dArr=6oj;m2&dHar=86d;19t&dagger=6cg;e5&daleth=6k8;h7&darr=6mr;ia&dash=6c0;dl&dashv=6ub;xr&dbkarow=83z;180&dblac=kd;8b&dcaron=7j;4x&dcy=tw;bv&dd=6km;hb&ddagger=6ch;e6&ddarr=6oa;ld&ddotseq=8dz;1ep°=4w;21&delta=qc;95&demptyv=88h;1be&dfisht=873;1aj&dfr=2ko1;1l5&dharl=6o3;kx&dharr=6o
2;ku&diam=6v8;zc&diamond=6v8;zb&diamondsuit=7l2;16b&diams=7l2;16c&die=4o;1o&digamma=rh;a6&disin=6wi;11j&div=6v;49÷=6v;48÷ontimes=6vb;zg&divonx=6vb;zh&djcy=uq;co&dlcorn=6xq;12n&dlcrop=6x9;12a&dollar=10;6&dopf=2kph;1me&dot=k9;85&doteq=6s0;rx&doteqdot=6s1;rz&dotminus=6rc;q2&dotplus=6qc;ny&dotsquare=6u9;xm&doublebarwedge=6x2;11z&downarrow=6mr;i9&downdownarrows=6oa;lc&downharpoonleft=6o3;kv&downharpoonright=6o2;ks&drbkarow=840;182&drcorn=6xr;12p&drcrop=6x8;129&dscr=2kl5;1jx&dscy=ut;cr&dsol=8ae;1cc&dstrok=7l;4z&dtdot=6wh;11i&dtri=7gf;15j&dtrif=7ge;15h&duarr=6ph;mo&duhar=86n;1a5&dwangle=886;1b3&dzcy=v3;d0&dzigrarr=7wf;17r&eDDot=8dz;1eq&eDot=6s1;s0é=6h;3u&easter=8dq;1eg&ecaron=7v;57&ecir=6s6;sbê=6i;3v&ecolon=6s5;s9&ecy=ul;ck&edot=7r;53&ee=6kn;he&efDot=6s2;s2&efr=2ko2;1l6&eg=8ey;1g9è=6g;3t&egs=8eu;1g5&egsdot=8ew;1g7&el=8ex;1g8&elinters=73b;13e&ell=6j7;fv&els=8et;1g3&elsdot=8ev;1g6&emacr=7n;51&empty=6px;n7&emptyset=6px;n5&emptyv=6px;n6&emsp=6bn;d2&emsp13=6bo;d3&emsp14=6bp;d4&eng=97;6h&ensp=6bm;d1&eogon=7t;55&eopf=2kpi;1mf&epar=6vp;103&eparsl=89v;1c6&eplus=8dt;1ek&epsi=qd;97&epsilon=qd;96&epsiv=s5;ae&eqcirc=6s6;sa&eqcolon=6s5;s8&eqsim=6rm;qq&eqslantgtr=8eu;1g4&eqslantless=8et;1g2&equals=1p;p&equest=6sf;sj&equiv=6sh;so&equivDD=8e0;1er&eqvparsl=89x;1c8&erDot=6s3;s4&erarr=86p;1a7&escr=6jz;gs&esdot=6s0;ry&esim=6rm;qr&eta=qf;99ð=6o;41ë=6j;3w&euro=6gc;f2&excl=x;2&exist=6pv;n0&expectation=6k0;gt&exponentiale=6kn;hd&fallingdotseq=6s2;s1&fcy=uc;cb&female=7k0;163&ffilig=1dkz;1ja&fflig=1dkw;1j7&ffllig=1dl0;1jb&ffr=2ko3;1l7&filig=1dkx;1j8&fjlig=2u,2y;15&flat=7l9;16e&fllig=1dky;1j9&fltns=7g1;153&fnof=b6;7v&fopf=2kpj;1mg&forall=6ps;mt&fork=6vo;102&forkv=8gp;1in&fpartint=8b1;1cp½=59;2k&frac13=6kz;hh¼=58;2j&frac15=6l1;hj&frac16=6l5;hn&frac18=6l7;hp&frac23=6l0;hi&frac25=6l2;hk¾=5a;2m&frac35=6l3;hl&frac38=6l8;hq&frac45=6l4;hm&frac56=6l6;ho&frac58=6l9;hr&frac78=6la;hs&frasl=6dg;eq&frown=6xu;12r&fscr=2kl7;1jy&gE=6sn;t8&gEl=8ek;1ft&gacute=dx;7x&gamma=qb;94&gammad=rh;a7&gap=8ee;1fh&gbreve=7z;5b&gci
rc=7x;59&gcy=tv;bu&gdot=81;5d&ge=6sl;sx&gel=6vv;10k&geq=6sl;sw&geqq=6sn;t7&geqslant=8e6;1f6&ges=8e6;1f7&gescc=8fd;1gn&gesdot=8e8;1f9&gesdoto=8ea;1fb&gesdotol=8ec;1fd&gesl=6vv,1e68;10h&gesles=8es;1g1&gfr=2ko4;1l8&gg=6sr;ts&ggg=6vt;10b&gimel=6k7;h6&gjcy=ur;cp&gl=6t3;un&glE=8eq;1fz&gla=8f9;1gj&glj=8f8;1gi&gnE=6sp;tg&gnap=8ei;1fp&gnapprox=8ei;1fo&gne=8eg;1fl&gneq=8eg;1fk&gneqq=6sp;tf&gnsim=6w7;10y&gopf=2kpk;1mh&grave=2o;14&gscr=6iy;f9&gsim=6sz;ud&gsime=8em;1fv&gsiml=8eo;1fx>=1q;s>cc=8fb;1gl>cir=8e2;1et>dot=6vr;107>lPar=87p;1aw>quest=8e4;1ev>rapprox=8ee;1fg>rarr=86w;1ad>rdot=6vr;106>reqless=6vv;10j>reqqless=8ek;1fs>rless=6t3;um>rsim=6sz;uc&gvertneqq=6sp,1e68;td&gvnE=6sp,1e68;te&hArr=6ok;m5&hairsp=6bu;da&half=59;2l&hamilt=6iz;fb&hardcy=ui;ch&harr=6ms;id&harrcir=85k;192&harrw=6nh;js&hbar=6j3;fl&hcirc=85;5g&hearts=7l1;16a&heartsuit=7l1;169&hellip=6cm;eb&hercon=6ux;yr&hfr=2ko5;1l9&hksearow=84l;18i&hkswarow=84m;18k&hoarr=6pr;mr&homtht=6rf;q5&hookleftarrow=6nd;jj&hookrightarrow=6ne;jl&hopf=2kpl;1mi&horbar=6c5;do&hscr=2kl9;1jz&hslash=6j3;fi&hstrok=87;5i&hybull=6df;ep&hyphen=6c0;dkí=6l;3y&ic=6eb;f1î=6m;3z&icy=u0;bz&iecy=tx;bw¡=4h;1f&iff=6ok;m6&ifr=2ko6;1laì=6k;3x&ii=6ko;hg&iiiint=8b0;1cn&iiint=6r1;pg&iinfin=89o;1c3&iiota=6jt;gm&ijlig=8j;5t&imacr=8b;5m&image=6j5;fp&imagline=6j4;fm&imagpart=6j5;fo&imath=8h;5r&imof=6uv;yo&imped=c5;7w&in=6q0;nd&incare=6it;f8&infin=6qm;of&infintie=89p;1c4&inodot=8h;5q&int=6qz;pe&intcal=6uy;yt&integers=6jo;gh&intercal=6uy;ys&intlarhk=8bb;1cx&intprod=8cc;1dk&iocy=up;cn&iogon=8f;5o&iopf=2kpm;1mj&iota=qh;9b&iprod=8cc;1dl¿=5b;2n&iscr=2kla;1k0&isin=6q0;nc&isinE=6wp;11r&isindot=6wl;11n&isins=6wk;11l&isinsv=6wj;11k&isinv=6q0;nb&it=6ea;ez&itilde=89;5k&iukcy=uu;csï=6n;40&jcirc=8l;5v&jcy=u1;c0&jfr=2ko7;1lb&jmath=fr;7y&jopf=2kpn;1mk&jscr=2klb;1k1&jsercy=uw;cu&jukcy=us;cq&kappa=qi;9c&kappav=s0;a9&kcedil=8n;5x&kcy=u2;c1&kfr=2ko8;1lc&kgreen=8o;5y&khcy=ud;cc&kjcy=v0;cy&kopf=2kpo;1ml&kscr=2klc;1k2&lAarr=6oq;mf&lArr=6og;ls&lAtail=84b;18a&lBarr=83y;17z&lE=6sm;t2&lEg=8e
j;1fr&lHar=86a;19q&lacute=8q;60&laemptyv=88k;1bh&lagran=6j6;ft&lambda=qj;9d&lang=7vs;16z&langd=87l;1as&langle=7vs;16y&lap=8ed;1ff«=4r;1t&larr=6mo;hx&larrb=6p0;mk&larrbfs=84f;18e&larrfs=84d;18c&larrhk=6nd;jk&larrlp=6nf;jo&larrpl=855;18y&larrsim=86r;1a9&larrtl=6n6;j7&lat=8ff;1gp&latail=849;188&late=8fh;1gt&lates=8fh,1e68;1gs&lbarr=83w;17w&lbbrk=7si;16p&lbrace=3f;16&lbrack=2j;v&lbrke=87f;1am&lbrksld=87j;1aq&lbrkslu=87h;1ao&lcaron=8u;64&lcedil=8s;62&lceil=6x4;122&lcub=3f;17&lcy=u3;c2&ldca=852;18v&ldquo=6cc;dz&ldquor=6ce;e3&ldrdhar=86f;19v&ldrushar=85n;195&ldsh=6nm;jz&le=6sk;st&leftarrow=6mo;hv&leftarrowtail=6n6;j6&leftharpoondown=6nx;kd&leftharpoonup=6nw;ka&leftleftarrows=6o7;l6&leftrightarrow=6ms;ic&leftrightarrows=6o6;l4&leftrightharpoons=6ob;lf&leftrightsquigarrow=6nh;jr&leftthreetimes=6vf;zl&leg=6vu;10g&leq=6sk;ss&leqq=6sm;t1&leqslant=8e5;1f0&les=8e5;1f1&lescc=8fc;1gm&lesdot=8e7;1f8&lesdoto=8e9;1fa&lesdotor=8eb;1fc&lesg=6vu,1e68;10d&lesges=8er;1g0&lessapprox=8ed;1fe&lessdot=6vq;104&lesseqgtr=6vu;10f&lesseqqgtr=8ej;1fq&lessgtr=6t2;uj&lesssim=6sy;u9&lfisht=870;1ag&lfloor=6x6;126&lfr=2ko9;1ld&lg=6t2;uk&lgE=8ep;1fy&lhard=6nx;kf&lharu=6nw;kc&lharul=86i;19y&lhblk=7es;14o&ljcy=ux;cv&ll=6sq;tm&llarr=6o7;l7&llcorner=6xq;12m&llhard=86j;19z&lltri=7i2;15w&lmidot=8w;66&lmoust=71s;131&lmoustache=71s;130&lnE=6so;tc&lnap=8eh;1fn&lnapprox=8eh;1fm&lne=8ef;1fj&lneq=8ef;1fi&lneqq=6so;tb&lnsim=6w6;10x&loang=7vw;175&loarr=6pp;mp&lobrk=7vq;16u&longleftarrow=7w5;178&longleftrightarrow=7w7;17e&longmapsto=7wc;17p&longrightarrow=7w6;17b&looparrowleft=6nf;jn&looparrowright=6ng;jp&lopar=879;1ak&lopf=2kpp;1mm&loplus=8bx;1d6&lotimes=8c4;1dc&lowast=6qf;o5&lowbar=2n;12&loz=7gq;15p&lozenge=7gq;15o&lozf=8a3;1ca&lpar=14;b&lparlt=87n;1au&lrarr=6o6;l5&lrcorner=6xr;12o&lrhar=6ob;lg&lrhard=86l;1a1&lrm=6by;di&lrtri=6v3;yx&lsaquo=6d5;ek&lscr=2kld;1k3&lsh=6nk;jw&lsim=6sy;ua&lsime=8el;1fu&lsimg=8en;1fw&lsqb=2j;w&lsquo=6c8;ds&lsquor=6ca;dw&lstrok=8y;68<=1o;n<cc=8fa;1gk<cir=8e1;1es<dot=6vq;105<hree=6vf;zm<imes=
6vd;zj<larr=86u;1ac<quest=8e3;1eu<rPar=87q;1ax<ri=7gj;15n<rie=6us;yi<rif=7gi;15l&lurdshar=85m;194&luruhar=86e;19u&lvertneqq=6so,1e68;t9&lvnE=6so,1e68;ta&mDDot=6re;q4¯=4v;20&male=7k2;164&malt=7q8;16m&maltese=7q8;16l&map=6na;jg&mapsto=6na;jf&mapstodown=6nb;ji&mapstoleft=6n8;jb&mapstoup=6n9;jd&marker=7fy;152&mcomma=8bt;1d4&mcy=u4;c3&mdash=6c4;dn&measuredangle=6qp;ok&mfr=2koa;1le&mho=6jr;gjµ=51;29&mid=6qr;oq&midast=16;d&midcir=8hc;1j1·=53;2d&minus=6qa;nu&minusb=6u7;xj&minusd=6rc;q3&minusdu=8bu;1d5&mlcp=8gr;1ip&mldr=6cm;ec&mnplus=6qb;nw&models=6uf;xy&mopf=2kpq;1mn&mp=6qb;nx&mscr=2kle;1k4&mstpos=6ri;qf&mu=qk;9e&multimap=6uw;yp&mumap=6uw;yq&nGg=6vt,mw;10a&nGt=6sr,6he;tp&nGtv=6sr,mw;to&nLeftarrow=6od;lk&nLeftrightarrow=6oe;lm&nLl=6vs,mw;108&nLt=6sq,6he;tj&nLtv=6sq,mw;ti&nRightarrow=6of;lo&nVDash=6un;y7&nVdash=6um;y6&nabla=6pz;n8&nacute=90;6a&nang=6qo,6he;oh&nap=6rt;rb&napE=8ds,mw;1ei&napid=6rv,mw;re&napos=95;6f&napprox=6rt;ra&natur=7la;16g&natural=7la;16f&naturals=6j9;fw =4g;1e&nbump=6ry,mw;rm&nbumpe=6rz,mw;rr&ncap=8cj;1dp&ncaron=94;6e&ncedil=92;6c&ncong=6rr;r2&ncongdot=8dp,mw;1ee&ncup=8ci;1do&ncy=u5;c4&ndash=6c3;dm&ne=6sg;sl&neArr=6on;mb&nearhk=84k;18h&nearr=6mv;im&nearrow=6mv;il&nedot=6s0,mw;rv&nequiv=6si;sq&nesear=84o;18n&nesim=6rm,mw;qo&nexist=6pw;n3&nexists=6pw;n2&nfr=2kob;1lf&ngE=6sn,mw;t4&nge=6sx;u7&ngeq=6sx;u6&ngeqq=6sn,mw;t5&ngeqslant=8e6,mw;1f3&nges=8e6,mw;1f4&ngsim=6t1;uh&ngt=6sv;u1&ngtr=6sv;u0&nhArr=6oe;ln&nharr=6ni;ju&nhpar=8he;1j3&ni=6q3;nk&nis=6ws;11u&nisd=6wq;11s&niv=6q3;nj&njcy=uy;cw&nlArr=6od;ll&nlE=6sm,mw;sy&nlarr=6my;iu&nldr=6cl;ea&nle=6sw;u4&nleftarrow=6my;it&nleftrightarrow=6ni;jt&nleq=6sw;u3&nleqq=6sm,mw;sz&nleqslant=8e5,mw;1ex&nles=8e5,mw;1ey&nless=6su;tx&nlsim=6t0;uf&nlt=6su;ty&nltri=6wa;115&nltrie=6wc;11b&nmid=6qs;ou&nopf=2kpr;1mo¬=4s;1u¬in=6q1;ng¬inE=6wp,mw;11q¬indot=6wl,mw;11m¬inva=6q1;nf¬invb=6wn;11p¬invc=6wm;11o¬ni=6q4;nn¬niva=6q4;nm¬nivb=6wu;11w¬nivc=6wt;11v&npar=6qu;p4&nparallel=6qu;p2&nparsl=8hp,6hx;1j5&npart=6pu,mw;mw&npolint=8b8;1cu&npr=6t
c;vd&nprcue=6w0;10q&npre=8fj,mw;1gw&nprec=6tc;vc&npreceq=8fj,mw;1gx&nrArr=6of;lp&nrarr=6mz;iw&nrarrc=84z,mw;18s&nrarrw=6n1,mw;ix&nrightarrow=6mz;iv&nrtri=6wb;118&nrtrie=6wd;11e&nsc=6td;vg&nsccue=6w1;10s&nsce=8fk,mw;1h2&nscr=2klf;1k5&nshortmid=6qs;os&nshortparallel=6qu;p1&nsim=6rl;qm&nsime=6ro;qx&nsimeq=6ro;qw&nsmid=6qs;ot&nspar=6qu;p3&nsqsube=6w2;10u&nsqsupe=6w3;10w&nsub=6tg;vs&nsubE=8g5,mw;1hv&nsube=6tk;w2&nsubset=6te,6he;vi&nsubseteq=6tk;w1&nsubseteqq=8g5,mw;1hw&nsucc=6td;vf&nsucceq=8fk,mw;1h3&nsup=6th;vt&nsupE=8g6,mw;1hz&nsupe=6tl;w5&nsupset=6tf,6he;vn&nsupseteq=6tl;w4&nsupseteqq=8g6,mw;1i0&ntgl=6t5;urñ=6p;42&ntlg=6t4;up&ntriangleleft=6wa;114&ntrianglelefteq=6wc;11a&ntriangleright=6wb;117&ntrianglerighteq=6wd;11d&nu=ql;9f&num=z;5&numero=6ja;fy&numsp=6br;d5&nvDash=6ul;y5&nvHarr=83o;17u&nvap=6rx,6he;ri&nvdash=6uk;y4&nvge=6sl,6he;su&nvgt=1q,6he;q&nvinfin=89q;1c5&nvlArr=83m;17s&nvle=6sk,6he;sr&nvlt=1o,6he;l&nvltrie=6us,6he;yf&nvrArr=83n;17t&nvrtrie=6ut,6he;yj&nvsim=6rg,6he;q6&nwArr=6om;ma&nwarhk=84j;18g&nwarr=6mu;ij&nwarrow=6mu;ii&nwnear=84n;18m&oS=79k;13hó=6r;44&oast=6u3;xd&ocir=6u2;xbô=6s;45&ocy=u6;c5&odash=6u5;xf&odblac=9d;6l&odiv=8c8;1dg&odot=6u1;x9&odsold=88s;1bn&oelig=9f;6n&ofcir=88v;1bp&ofr=2koc;1lg&ogon=kb;87ò=6q;43&ogt=88x;1br&ohbar=88l;1bi&ohm=q1;91&oint=6r2;pk&olarr=6nu;k7&olcir=88u;1bo&olcross=88r;1bm&oline=6da;en&olt=88w;1bq&omacr=99;6j&omega=qx;9u&omicron=qn;9h&omid=88m;1bj&ominus=6ty;x4&oopf=2kps;1mp&opar=88n;1bk&operp=88p;1bl&oplus=6tx;x2&or=6qw;p8&orarr=6nv;k9&ord=8d9;1ea&order=6k4;h1&orderof=6k4;h0ª=4q;1sº=56;2h&origof=6uu;yn&oror=8d2;1e4&orslope=8d3;1e5&orv=8d7;1e8&oscr=6k4;h2ø=6w;4a&osol=6u0;x7õ=6t;46&otimes=6tz;x6&otimesas=8c6;1deö=6u;47&ovbar=6yl;12x&par=6qt;oz¶=52;2a¶llel=6qt;ox&parsim=8hf;1j4&parsl=8hp;1j6&part=6pu;my&pcy=u7;c6&percnt=11;7&period=1a;h&permil=6cw;ed&perp=6ud;xw&pertenk=6cx;ee&pfr=2kod;1lh&phi=qu;9r&phiv=r9;a2&phmmat=6k3;gy&phone=7im;162&pi=qo;9i&pitchfork=6vo;101&piv=ra;a4&planck=6j3;fj&planckh=6j2;fh&plankv=6j3;fk&plus=17;f&pl
usacir=8bn;1cz&plusb=6u6;xh&pluscir=8bm;1cy&plusdo=6qc;nz&plusdu=8bp;1d1&pluse=8du;1el±=4x;23&plussim=8bq;1d2&plustwo=8br;1d3&pm=4x;24&pointint=8b9;1cv&popf=2kpt;1mq£=4j;1h&pr=6t6;uu&prE=8fn;1h7&prap=8fr;1he&prcue=6t8;v0&pre=8fj;1h0&prec=6t6;ut&precapprox=8fr;1hd&preccurlyeq=6t8;uz&preceq=8fj;1gz&precnapprox=8ft;1hh&precneqq=8fp;1h9&precnsim=6w8;10z&precsim=6ta;v5&prime=6cy;ef&primes=6jd;g2&prnE=8fp;1ha&prnap=8ft;1hi&prnsim=6w8;110&prod=6q7;np&profalar=6y6;12v&profline=6xe;12e&profsurf=6xf;12f&prop=6ql;oe&propto=6ql;oc&prsim=6ta;v6&prurel=6uo;y8&pscr=2klh;1k6&psi=qw;9t&puncsp=6bs;d6&qfr=2koe;1li&qint=8b0;1co&qopf=2kpu;1mr&qprime=6dz;es&qscr=2kli;1k7&quaternions=6j1;ff&quatint=8ba;1cw&quest=1r;t&questeq=6sf;si"=y;4&rAarr=6or;mh&rArr=6oi;lz&rAtail=84c;18b&rBarr=83z;181&rHar=86c;19s&race=6rh,mp;qb&racute=9h;6p&radic=6qi;o8&raemptyv=88j;1bg&rang=7vt;172&rangd=87m;1at&range=885;1b2&rangle=7vt;171»=57;2i&rarr=6mq;i6&rarrap=86t;1ab&rarrb=6p1;mm&rarrbfs=84g;18f&rarrc=84z;18t&rarrfs=84e;18d&rarrhk=6ne;jm&rarrlp=6ng;jq&rarrpl=85h;191&rarrsim=86s;1aa&rarrtl=6n7;j9&rarrw=6n1;iz&ratail=84a;189&ratio=6ra;pz&rationals=6je;g4&rbarr=83x;17y&rbbrk=7sj;16q&rbrace=3h;1b&rbrack=2l;y&rbrke=87g;1an&rbrksld=87i;1ap&rbrkslu=87k;1ar&rcaron=9l;6t&rcedil=9j;6r&rceil=6x5;124&rcub=3h;1c&rcy=u8;c7&rdca=853;18w&rdldhar=86h;19x&rdquo=6cd;e2&rdquor=6cd;e1&rdsh=6nn;k0&real=6jg;g9&realine=6jf;g6&realpart=6jg;g8&reals=6jh;gc&rect=7fx;151®=4u;1y&rfisht=871;1ah&rfloor=6x7;128&rfr=2kof;1lj&rhard=6o1;kr&rharu=6o0;ko&rharul=86k;1a0&rho=qp;9j&rhov=s1;ab&rightarrow=6mq;i4&rightarrowtail=6n7;j8&rightharpoondown=6o1;kp&rightharpoonup=6o0;km&rightleftarrows=6o4;kz&rightleftharpoons=6oc;lh&rightrightarrows=6o9;la&rightsquigarrow=6n1;iy&rightthreetimes=6vg;zn&ring=ka;86&risingdotseq=6s3;s3&rlarr=6o4;l0&rlhar=6oc;lj&rlm=6bz;dj&rmoust=71t;133&rmoustache=71t;132&rnmid=8ha;1iz&roang=7vx;176&roarr=6pq;mq&robrk=7vr;16w&ropar=87a;1al&ropf=2kpv;1ms&roplus=8by;1d7&rotimes=8c5;1dd&rpar=15;c&rpargt=87o;1av&rppolint=8b6;1cs&r
rarr=6o9;lb&rsaquo=6d6;el&rscr=2klj;1k8&rsh=6nl;jy&rsqb=2l;z&rsquo=6c9;dv&rsquor=6c9;du&rthree=6vg;zo&rtimes=6ve;zk&rtri=7g9;15d&rtrie=6ut;ym&rtrif=7g8;15b&rtriltri=89a;1by&ruluhar=86g;19w&rx=6ji;ge&sacute=9n;6v&sbquo=6ca;dx&sc=6t7;ux&scE=8fo;1h8&scap=8fs;1hg&scaron=9t;71&sccue=6t9;v3&sce=8fk;1h6&scedil=9r;6z&scirc=9p;6x&scnE=8fq;1hc&scnap=8fu;1hk&scnsim=6w9;112&scpolint=8b7;1ct&scsim=6tb;va&scy=u9;c8&sdot=6v9;zd&sdotb=6u9;xn&sdote=8di;1ec&seArr=6oo;mc&searhk=84l;18j&searr=6mw;ip&searrow=6mw;io§=4n;1l&semi=1n;k&seswar=84p;18p&setminus=6qe;o2&setmn=6qe;o4&sext=7qu;16n&sfr=2kog;1lk&sfrown=6xu;12q&sharp=7lb;16h&shchcy=uh;cg&shcy=ug;cf&shortmid=6qr;oo&shortparallel=6qt;ow­=4t;1v&sigma=qr;9n&sigmaf=qq;9l&sigmav=qq;9m&sim=6rg;qa&simdot=8dm;1ed&sime=6rn;qu&simeq=6rn;qt&simg=8f2;1gb&simgE=8f4;1gd&siml=8f1;1ga&simlE=8f3;1gc&simne=6rq;r0&simplus=8bo;1d0&simrarr=86q;1a8&slarr=6mo;hw&smallsetminus=6qe;o0&smashp=8c3;1db&smeparsl=89w;1c7&smid=6qr;op&smile=6xv;12t&smt=8fe;1go&smte=8fg;1gr&smtes=8fg,1e68;1gq&softcy=uk;cj&sol=1b;i&solb=890;1bu&solbar=6yn;12y&sopf=2kpw;1mt&spades=7kw;166&spadesuit=7kw;165&spar=6qt;oy&sqcap=6tv;wx&sqcaps=6tv,1e68;wv&sqcup=6tw;x0&sqcups=6tw,1e68;wy&sqsub=6tr;wk&sqsube=6tt;wr&sqsubset=6tr;wj&sqsubseteq=6tt;wq&sqsup=6ts;wo&sqsupe=6tu;wu&sqsupset=6ts;wn&sqsupseteq=6tu;wt&squ=7fl;14v&square=7fl;14u&squarf=7fu;14y&squf=7fu;14z&srarr=6mq;i5&sscr=2klk;1k9&ssetmn=6qe;o3&ssmile=6xv;12s&sstarf=6va;ze&star=7ie;161&starf=7id;160&straightepsilon=s5;ac&straightphi=r9;a0&strns=4v;1z&sub=6te;vl&subE=8g5;1hy&subdot=8fx;1hn&sube=6ti;vw&subedot=8g3;1ht&submult=8g1;1hr&subnE=8gb;1i8&subne=6tm;w9&subplus=8fz;1hp&subrarr=86x;1ae&subset=6te;vk&subseteq=6ti;vv&subseteqq=8g5;1hx&subsetneq=6tm;w8&subsetneqq=8gb;1i7&subsim=8g7;1i3&subsub=8gl;1ij&subsup=8gj;1ih&succ=6t7;uw&succapprox=8fs;1hf&succcurlyeq=6t9;v2&succeq=8fk;1h5&succnapprox=8fu;1hj&succneqq=8fq;1hb&succnsim=6w9;111&succsim=6tb;v9&sum=6q9;nt&sung=7l6;16d&sup=6tf;vr¹=55;2g²=4y;25³=4z;26&supE=8g6;1i2&supdot=8fy;1ho&supd
sub=8go;1im&supe=6tj;vz&supedot=8g4;1hu&suphsol=7ux;16s&suphsub=8gn;1il&suplarr=86z;1af&supmult=8g2;1hs&supnE=8gc;1ic&supne=6tn;wd&supplus=8g0;1hq&supset=6tf;vq&supseteq=6tj;vy&supseteqq=8g6;1i1&supsetneq=6tn;wc&supsetneqq=8gc;1ib&supsim=8g8;1i4&supsub=8gk;1ii&supsup=8gm;1ik&swArr=6op;md&swarhk=84m;18l&swarr=6mx;is&swarrow=6mx;ir&swnwar=84q;18rß=67;3k&target=6xi;12h&tau=qs;9o&tbrk=71w;135&tcaron=9x;75&tcedil=9v;73&tcy=ua;c9&tdot=6hn;f4&telrec=6xh;12g&tfr=2koh;1ll&there4=6r8;pv&therefore=6r8;pu&theta=qg;9a&thetasym=r5;9v&thetav=r5;9x&thickapprox=6rs;r3&thicksim=6rg;q7&thinsp=6bt;d8&thkap=6rs;r7&thksim=6rg;q8þ=72;4g&tilde=kc;89×=5z;3c×b=6u8;xl×bar=8c1;1da×d=8c0;1d9&tint=6r1;ph&toea=84o;18o&top=6uc;xt&topbot=6ye;12w&topcir=8hd;1j2&topf=2kpx;1mu&topfork=8gq;1io&tosa=84p;18q&tprime=6d0;eh&trade=6jm;gg&triangle=7g5;158&triangledown=7gf;15i&triangleleft=7gj;15m&trianglelefteq=6us;yh&triangleq=6sc;sg&triangleright=7g9;15c&trianglerighteq=6ut;yl&tridot=7ho;15r&trie=6sc;sh&triminus=8ca;1di&triplus=8c9;1dh&trisb=899;1bx&tritime=8cb;1dj&trpezium=736;13d&tscr=2kll;1ka&tscy=ue;cd&tshcy=uz;cx&tstrok=9z;77&twixt=6ss;tu&twoheadleftarrow=6n2;j0&twoheadrightarrow=6n4;j3&uArr=6oh;lv&uHar=86b;19rú=6y;4c&uarr=6mp;i1&ubrcy=v2;cz&ubreve=a5;7dû=6z;4d&ucy=ub;ca&udarr=6o5;l2&udblac=a9;7h&udhar=86m;1a3&ufisht=872;1ai&ufr=2koi;1lmù=6x;4b&uharl=6nz;kl&uharr=6ny;ki&uhblk=7eo;14n&ulcorn=6xo;12j&ulcorner=6xo;12i&ulcrop=6xb;12c&ultri=7i0;15u&umacr=a3;7b¨=4o;1p&uogon=ab;7j&uopf=2kpy;1mv&uparrow=6mp;i0&updownarrow=6mt;if&upharpoonleft=6nz;kj&upharpoonright=6ny;kg&uplus=6tq;wg&upsi=qt;9q&upsih=r6;9y&upsilon=qt;9p&upuparrows=6o8;l8&urcorn=6xp;12l&urcorner=6xp;12k&urcrop=6xa;12b&uring=a7;7f&urtri=7i1;15v&uscr=2klm;1kb&utdot=6wg;11h&utilde=a1;79&utri=7g5;159&utrif=7g4;157&uuarr=6o8;l9ü=70;4e&uwangle=887;1b4&vArr=6ol;m9&vBar=8h4;1iu&vBarv=8h5;1iv&vDash=6ug;y0&vangrt=87w;1az&varepsilon=s5;ad&varkappa=s0;a8&varnothing=6px;n4&varphi=r9;a1&varpi=ra;a3&varpropto=6ql;ob&varr=6mt;ig&varrho=s1;aa&varsigma=qq;9k&va
rsubsetneq=6tm,1e68;w6&varsubsetneqq=8gb,1e68;1i5&varsupsetneq=6tn,1e68;wa&varsupsetneqq=8gc,1e68;1i9&vartheta=r5;9w&vartriangleleft=6uq;y9&vartriangleright=6ur;yc&vcy=tu;bt&vdash=6ua;xp&vee=6qw;p7&veebar=6uz;yu&veeeq=6sa;sf&vellip=6we;11f&verbar=3g;19&vert=3g;1a&vfr=2koj;1ln&vltri=6uq;yb&vnsub=6te,6he;vj&vnsup=6tf,6he;vo&vopf=2kpz;1mw&vprop=6ql;od&vrtri=6ur;ye&vscr=2kln;1kc&vsubnE=8gb,1e68;1i6&vsubne=6tm,1e68;w7&vsupnE=8gc,1e68;1ia&vsupne=6tn,1e68;wb&vzigzag=87u;1ay&wcirc=ad;7l&wedbar=8db;1eb&wedge=6qv;p5&wedgeq=6s9;se&weierp=6jc;g0&wfr=2kok;1lo&wopf=2kq0;1mx&wp=6jc;g1&wr=6rk;qk&wreath=6rk;qj&wscr=2klo;1kd&xcap=6v6;z6&xcirc=7hr;15t&xcup=6v7;z9&xdtri=7gd;15f&xfr=2kol;1lp&xhArr=7wa;17o&xharr=7w7;17f&xi=qm;9g&xlArr=7w8;17i&xlarr=7w5;179&xmap=7wc;17q&xnis=6wr;11t&xodot=8ao;1ce&xopf=2kq1;1my&xoplus=8ap;1cg&xotime=8aq;1ci&xrArr=7w9;17l&xrarr=7w6;17c&xscr=2klp;1ke&xsqcup=8au;1cm&xuplus=8as;1ck&xutri=7g3;155&xvee=6v5;z2&xwedge=6v4;yzý=71;4f&yacy=un;cm&ycirc=af;7n&ycy=uj;ci¥=4l;1j&yfr=2kom;1lq&yicy=uv;ct&yopf=2kq2;1mz&yscr=2klq;1kf&yucy=um;clÿ=73;4h&zacute=ai;7q&zcaron=am;7u&zcy=tz;by&zdot=ak;7s&zeetrf=6js;gk&zeta=qe;98&zfr=2kon;1lr&zhcy=ty;bx&zigrarr=6ot;mi&zopf=2kq3;1n0&zscr=2klr;1kg&zwj=6bx;dh&zwnj=6bw;dg&";
+}
+
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/CharacterReader.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/CharacterReader.java
new file mode 100644
index 00000000..c29b4454
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/CharacterReader.java
@@ -0,0 +1,483 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Locale;
+
+import ru.noties.markwon.html.jsoup.UncheckedIOException;
+import ru.noties.markwon.html.jsoup.helper.Validate;
+
+/**
+ CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
+ */
+public final class CharacterReader {
+ /** Sentinel returned by current() and consume() when no buffered input remains. */
+ static final char EOF = (char) -1;
+ // strings longer than this are never cached by cacheString
+ private static final int maxStringCacheLen = 12;
+ // upper bound on the buffer size, and the read-ahead limit passed to reader.mark() when refilling
+ static final int maxBufferLen = 1024 * 32; // visible for testing
+ // cap on bufSplitPoint: with a full-size buffer, the next refill triggers at 75% of maxBufferLen
+ private static final int readAheadLimit = (int) (maxBufferLen * 0.75);
+
+ private final char[] charBuf; // the currently buffered window of the input
+ private final Reader reader; // underlying source; must support mark/reset (checked in the constructor)
+ private int bufLength; // number of chars placed in charBuf by the last successful fill
+ private int bufSplitPoint; // once bufPos reaches this index, bufferUp() attempts a refill
+ private int bufPos; // cursor within charBuf
+ private int readerPos; // offset of charBuf[0] within the whole input; pos() is readerPos + bufPos
+ private int bufMark; // bufPos saved by mark(); reset to 0 whenever the buffer is refilled
+ // length must stay a power of two: cacheString masks the hash with (length - 1)
+ private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage
+
+    /**
+     * Creates a reader over {@code input} and fills the first buffer.
+     * @param input source of characters; must be non-null and support mark/reset
+     * @param sz requested buffer size in chars; capped at {@code maxBufferLen}
+     */
+    public CharacterReader(Reader input, int sz) {
+        Validate.notNull(input);
+        Validate.isTrue(input.markSupported());
+        this.reader = input;
+        this.charBuf = new char[Math.min(sz, maxBufferLen)];
+        bufferUp();
+    }
+
+ /**
+ * Creates a reader over input using the largest permitted buffer (maxBufferLen).
+ * @param input source of characters; validated by the two-argument constructor
+ */
+ public CharacterReader(Reader input) {
+ this(input, maxBufferLen);
+ }
+
+ /**
+ * Creates a reader over an in-memory string, requesting a buffer as long as the string
+ * (the two-argument constructor caps that at maxBufferLen).
+ * @param input the text to read; must not be null
+ */
+ public CharacterReader(String input) {
+ this(new StringReader(input), input.length());
+ }
+
+ /**
+ * Refills charBuf from the reader once the cursor has reached bufSplitPoint; otherwise does nothing.
+ * On a successful read the buffer restarts at the old cursor position: readerPos absorbs the old
+ * bufPos, and bufPos and bufMark restart at 0. When the reader has nothing left (read returns -1)
+ * every field is left as it was.
+ * An IOException from the reader is rethrown as UncheckedIOException.
+ */
+ private void bufferUp() {
+ if (bufPos < bufSplitPoint)
+ return;
+
+ try {
+ // NOTE(review): the count returned by skip() is ignored; a Reader that skips fewer chars than
+ // requested would leave the buffer out of step with readerPos - confirm for non-String readers
+ reader.skip(bufPos);
+ reader.mark(maxBufferLen);
+ final int read = reader.read(charBuf);
+ reader.reset(); // rewind to the mark, so the next refill can skip() relative to charBuf[0]
+ if (read != -1) {
+ bufLength = read;
+ readerPos += bufPos;
+ bufPos = 0;
+ bufMark = 0;
+ bufSplitPoint = bufLength > readAheadLimit ? readAheadLimit : bufLength;
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException(e);
+ }
+ }
+
+ /**
+ * Gets the current cursor position in the content, counted from the start of the input
+ * (offset of the buffer plus the cursor's index inside it).
+ * @return current position
+ */
+ public int pos() {
+ return readerPos + bufPos;
+ }
+
+ /**
+ * Tests if all the content has been read. Attempts a buffer refill first.
+ * @return true if nothing left to read.
+ */
+ public boolean isEmpty() {
+ bufferUp();
+ return bufPos >= bufLength;
+ }
+
+ /**
+ * Same test as isEmpty(), but against the buffer as it currently stands (no refill attempt).
+ * Intended for callers that have just called bufferUp() themselves.
+ */
+ private boolean isEmptyNoBufferUp() {
+ return bufPos >= bufLength;
+ }
+
+    /**
+     * Peeks at the character under the cursor without consuming it.
+     * @return the current char, or {@code EOF} when no input remains
+     */
+    public char current() {
+        bufferUp();
+        if (isEmptyNoBufferUp()) {
+            return EOF;
+        }
+        return charBuf[bufPos];
+    }
+
+ char consume() {
+ bufferUp();
+ char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
+ bufPos++;
+ return val;
+ }
+
+ /**
+ * Steps the cursor back by one. No bounds check is made.
+ */
+ void unconsume() {
+ bufPos--;
+ }
+
+ /**
+ * Moves the current position by one. No refill or bounds check is made.
+ */
+ public void advance() {
+ bufPos++;
+ }
+
+ /**
+ * Remembers the current buffer position for rewindToMark(). A buffer refill resets the mark to 0.
+ */
+ void mark() {
+ bufMark = bufPos;
+ }
+
+ /**
+ * Moves the cursor back to the position saved by the last mark().
+ */
+ void rewindToMark() {
+ bufPos = bufMark;
+ }
+
+    /**
+     * Finds how far ahead of the cursor the next occurrence of {@code c} lies. Only the currently
+     * buffered characters are searched.
+     * @param c scan target
+     * @return offset from the current position to the next {@code c}, or -1 if not found
+     */
+    int nextIndexOf(char c) {
+        // doesn't handle scanning for surrogates
+        bufferUp();
+        int idx = bufPos;
+        while (idx < bufLength) {
+            if (charBuf[idx] == c) {
+                return idx - bufPos;
+            }
+            idx++;
+        }
+        return -1;
+    }
+
+ /**
+ * Returns the number of characters between the current position and the next instance of the input sequence.
+ * Only the currently buffered characters are searched.
+ *
+ * @param seq scan target; must be non-empty, as its first char is read unconditionally
+ * @return offset between current position and next instance of target. -1 if not found.
+ */
+ int nextIndexOf(CharSequence seq) {
+ bufferUp();
+ // doesn't handle scanning for surrogates
+ char startChar = seq.charAt(0);
+ for (int offset = bufPos; offset < bufLength; offset++) {
+ // scan to first instance of startchar:
+ if (startChar != charBuf[offset])
+ while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
+ int i = offset + 1; // first buffer index still to compare
+ int last = i + seq.length()-1; // exclusive end of the candidate match (offset + seq.length())
+ if (offset < bufLength && last <= bufLength) { // candidate fits entirely inside the buffer
+ for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
+ if (i == last) // found full sequence
+ return offset - bufPos;
+ }
+ }
+ return -1;
+ }
+
+    /**
+     * Consumes and returns everything before the next {@code c}; the delimiter itself is left
+     * unread. When {@code c} is not found, falls back to {@code consumeToEnd()}.
+     * @param c the delimiter
+     * @return the chars read
+     */
+    public String consumeTo(char c) {
+        final int distance = nextIndexOf(c);
+        if (distance == -1) {
+            return consumeToEnd();
+        }
+        final String head = cacheString(charBuf, stringCache, bufPos, distance);
+        bufPos += distance;
+        return head;
+    }
+
+    /**
+     * Consumes and returns everything before the next occurrence of {@code seq}; the sequence
+     * itself is left unread. When it is not found, falls back to {@code consumeToEnd()}.
+     */
+    String consumeTo(String seq) {
+        final int distance = nextIndexOf(seq);
+        if (distance == -1) {
+            return consumeToEnd();
+        }
+        final String head = cacheString(charBuf, stringCache, bufPos, distance);
+        bufPos += distance;
+        return head;
+    }
+
+ /**
+ * Read characters until the first of any delimiters is found.
+ * @param chars delimiters to scan for
+ * @return characters read up to the matched delimiter.
+ */
+ public String consumeToAny(final char... chars) {
+ bufferUp();
+ final int start = bufPos;
+ final int remaining = bufLength;
+ final char[] val = charBuf;
+
+ OUTER: while (bufPos < remaining) {
+ for (char c : chars) {
+ if (val[bufPos] == c)
+ break OUTER;
+ }
+ bufPos++;
+ }
+
+ return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
+ }
+
+    /**
+     * Like {@code consumeToAny}, but looks delimiters up with a binary search, so {@code chars}
+     * must already be sorted ascending.
+     */
+    String consumeToAnySorted(final char... chars) {
+        bufferUp();
+        final int begin = bufPos;
+        final int limit = bufLength;
+
+        while (bufPos < limit && Arrays.binarySearch(chars, charBuf[bufPos]) < 0) {
+            bufPos++;
+        }
+
+        if (bufPos > begin) {
+            return cacheString(charBuf, stringCache, begin, bufPos - begin);
+        }
+        return "";
+    }
+
+    /**
+     * Consumes a run of plain data: everything up to (not including) the next {@code &},
+     * {@code <} or null character, or the end of the buffered input.
+     */
+    String consumeData() {
+        bufferUp();
+        final int begin = bufPos;
+        final int limit = bufLength;
+
+        while (bufPos < limit) {
+            final char here = charBuf[bufPos];
+            final boolean stop = here == '&' || here == '<' || here == TokeniserState.nullChar;
+            if (stop) {
+                break;
+            }
+            bufPos++;
+        }
+
+        if (bufPos > begin) {
+            return cacheString(charBuf, stringCache, begin, bufPos - begin);
+        }
+        return "";
+    }
+
+    /**
+     * Consumes a tag name: everything up to (not including) the next tab, newline, carriage
+     * return, form feed, space, {@code /}, {@code >} or null character, or the end of the
+     * buffered input.
+     */
+    String consumeTagName() {
+        bufferUp();
+        final int begin = bufPos;
+        final int limit = bufLength;
+
+        while (bufPos < limit) {
+            final char here = charBuf[bufPos];
+            final boolean whitespace =
+                here == '\t' || here == '\n' || here == '\r' || here == '\f' || here == ' ';
+            final boolean terminator =
+                here == '/' || here == '>' || here == TokeniserState.nullChar;
+            if (whitespace || terminator) {
+                break;
+            }
+            bufPos++;
+        }
+
+        if (bufPos > begin) {
+            return cacheString(charBuf, stringCache, begin, bufPos - begin);
+        }
+        return "";
+    }
+
+ String consumeToEnd() {
+ bufferUp();
+ String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
+ bufPos = bufLength;
+ return data;
+ }
+
+ String consumeLetterSequence() {
+ bufferUp();
+ int start = bufPos;
+ while (bufPos < bufLength) {
+ char c = charBuf[bufPos];
+ if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
+ bufPos++;
+ else
+ break;
+ }
+
+ return cacheString(charBuf, stringCache, start, bufPos - start);
+ }
+
+ String consumeLetterThenDigitSequence() {
+ bufferUp();
+ int start = bufPos;
+ while (bufPos < bufLength) {
+ char c = charBuf[bufPos];
+ if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
+ bufPos++;
+ else
+ break;
+ }
+ while (!isEmptyNoBufferUp()) {
+ char c = charBuf[bufPos];
+ if (c >= '0' && c <= '9')
+ bufPos++;
+ else
+ break;
+ }
+
+ return cacheString(charBuf, stringCache, start, bufPos - start);
+ }
+
+ String consumeHexSequence() {
+ bufferUp();
+ int start = bufPos;
+ while (bufPos < bufLength) {
+ char c = charBuf[bufPos];
+ if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
+ bufPos++;
+ else
+ break;
+ }
+ return cacheString(charBuf, stringCache, start, bufPos - start);
+ }
+
+ String consumeDigitSequence() {
+ bufferUp();
+ int start = bufPos;
+ while (bufPos < bufLength) {
+ char c = charBuf[bufPos];
+ if (c >= '0' && c <= '9')
+ bufPos++;
+ else
+ break;
+ }
+ return cacheString(charBuf, stringCache, start, bufPos - start);
+ }
+
+    /** Tests whether the character under the cursor is {@code c}; false when no input remains. */
+    boolean matches(char c) {
+        if (isEmpty()) {
+            return false;
+        }
+        return charBuf[bufPos] == c;
+    }
+
+ boolean matches(String seq) {
+ bufferUp();
+ int scanLength = seq.length();
+ if (scanLength > bufLength - bufPos)
+ return false;
+
+ for (int offset = 0; offset < scanLength; offset++)
+ if (seq.charAt(offset) != charBuf[bufPos +offset])
+ return false;
+ return true;
+ }
+
+ boolean matchesIgnoreCase(String seq) {
+ bufferUp();
+ int scanLength = seq.length();
+ if (scanLength > bufLength - bufPos)
+ return false;
+
+ for (int offset = 0; offset < scanLength; offset++) {
+ char upScan = Character.toUpperCase(seq.charAt(offset));
+ char upTarget = Character.toUpperCase(charBuf[bufPos + offset]);
+ if (upScan != upTarget)
+ return false;
+ }
+ return true;
+ }
+
+ boolean matchesAny(char... seq) {
+ if (isEmpty())
+ return false;
+
+ bufferUp();
+ char c = charBuf[bufPos];
+ for (char seek : seq) {
+ if (seek == c)
+ return true;
+ }
+ return false;
+ }
+
+    /**
+     * Tests whether the character under the cursor is contained in {@code seq}, using a binary
+     * search.
+     * @param seq candidate characters; must be sorted ascending
+     * @return true on a match; false when there is no match or no input remains
+     */
+    boolean matchesAnySorted(char[] seq) {
+        // isEmpty() refills the buffer itself, so a separate bufferUp() beforehand is redundant
+        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
+    }
+
+ boolean matchesLetter() {
+ if (isEmpty())
+ return false;
+ char c = charBuf[bufPos];
+ return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
+ }
+
+ boolean matchesDigit() {
+ if (isEmpty())
+ return false;
+ char c = charBuf[bufPos];
+ return (c >= '0' && c <= '9');
+ }
+
+    /**
+     * Consumes {@code seq} if the input at the cursor starts with it (case-sensitive).
+     * @param seq the sequence to match
+     * @return true if the sequence matched and the cursor was moved past it; false otherwise
+     *         (cursor unchanged)
+     */
+    boolean matchConsume(String seq) {
+        // matches() refills the buffer itself, so no bufferUp() is needed beforehand
+        if (!matches(seq)) {
+            return false;
+        }
+        bufPos += seq.length();
+        return true;
+    }
+
+ boolean matchConsumeIgnoreCase(String seq) {
+ if (matchesIgnoreCase(seq)) {
+ bufPos += seq.length();
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ boolean containsIgnoreCase(String seq) {
+ // used to check presence of , . only finds consistent case.
+ String loScan = seq.toLowerCase(Locale.ENGLISH);
+ String hiScan = seq.toUpperCase(Locale.ENGLISH);
+ return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1);
+ }
+
+    /**
+     * Returns the buffered characters that have not been consumed yet.
+     * @return the unread remainder of the buffer; empty when the cursor is at or past its end
+     */
+    @Override
+    public String toString() {
+        final int remaining = bufLength - bufPos;
+        // consume() and advance() move the cursor even at end of input, so it can sit past
+        // bufLength; a negative count would make the String constructor throw
+        if (remaining <= 0) {
+            return "";
+        }
+        return new String(charBuf, bufPos, remaining);
+    }
+
+    /**
+     * Returns a String for {@code count} chars of {@code charBuf} starting at {@code start},
+     * reusing a previously created instance where possible to reduce garbage. The cache lives
+     * per reader (per document), so it cannot leak across documents.
+     *
+     * The cache is a plain array indexed by the low bits of the string hash. A slot holding a
+     * different string is simply overwritten with the new one, trading some duplicates for not
+     * needing key objects or collision chains.
+     */
+    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
+        // too long to be worth caching
+        if (count > maxStringCacheLen) {
+            return new String(charBuf, start, count);
+        }
+        if (count < 1) {
+            return "";
+        }
+
+        // same polynomial hash as String.hashCode, computed over the range
+        int hash = 0;
+        for (int i = start, end = start + count; i < end; i++) {
+            hash = 31 * hash + charBuf[i];
+        }
+
+        final int slot = hash & (stringCache.length - 1);
+        final String existing = stringCache[slot];
+        if (existing != null && rangeEquals(charBuf, start, count, existing)) {
+            return existing; // hit
+        }
+
+        // empty slot or a different string: store the new one, as recent strings tend to recur
+        final String fresh = new String(charBuf, start, count);
+        stringCache[slot] = fresh;
+        return fresh;
+    }
+
+    /**
+     * Checks whether the {@code count} chars of {@code charBuf} starting at {@code start} spell
+     * exactly the string {@code cached}.
+     */
+    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
+        if (count != cached.length()) {
+            return false;
+        }
+        for (int k = 0; k < count; k++) {
+            if (charBuf[start + k] != cached.charAt(k)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+ // just used for testing: applies the static rangeEquals to this reader's own buffer
+ boolean rangeEquals(final int start, final int count, final String cached) {
+ return rangeEquals(charBuf, start, count, cached);
+ }
+}
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseError.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseError.java
new file mode 100644
index 00000000..533f9aee
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseError.java
@@ -0,0 +1,41 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+/**
+ * Describes one problem found in the input HTML, during either tokenisation or tree building:
+ * where it occurred and what went wrong.
+ */
+public class ParseError {
+    private final int pos;
+    private final String errorMsg;
+
+    /**
+     * @param pos offset of the error within the input
+     * @param errorMsg the message, used verbatim
+     */
+    ParseError(int pos, String errorMsg) {
+        this.pos = pos;
+        this.errorMsg = errorMsg;
+    }
+
+    /**
+     * @param pos offset of the error within the input
+     * @param errorFormat a {@link String#format(String, Object...)} pattern
+     * @param args arguments for the pattern
+     */
+    ParseError(int pos, String errorFormat, Object... args) {
+        this(pos, String.format(errorFormat, args));
+    }
+
+    /**
+     * Retrieve the error message.
+     * @return the error message.
+     */
+    public String getErrorMessage() {
+        return errorMsg;
+    }
+
+    /**
+     * Retrieves the offset of the error.
+     * @return error offset within input
+     */
+    public int getPosition() {
+        return pos;
+    }
+
+    /** Formats the error as {@code "<position>: <message>"}. */
+    @Override
+    public String toString() {
+        return pos + ": " + errorMsg;
+    }
+}
+
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseErrorList.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseErrorList.java
new file mode 100644
index 00000000..a3e42a08
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseErrorList.java
@@ -0,0 +1,34 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+import java.util.ArrayList;
+
+/**
+ * A container for ParseErrors that carries a cap on how many errors should be recorded.
+ * The cap is advisory: callers are expected to check {@link #canAddError()} before adding.
+ *
+ * @author Jonathan Hedley
+ */
+public class ParseErrorList extends ArrayList<ParseError> {
+    private static final int INITIAL_CAPACITY = 16;
+    private final int maxSize;
+
+    /**
+     * @param initialCapacity initial capacity of the backing list
+     * @param maxSize number of errors after which {@link #canAddError()} reports false
+     */
+    ParseErrorList(int initialCapacity, int maxSize) {
+        super(initialCapacity);
+        this.maxSize = maxSize;
+    }
+
+    /**
+     * @return true while fewer than {@code maxSize} errors have been recorded
+     */
+    boolean canAddError() {
+        return size() < maxSize;
+    }
+
+    /**
+     * @return the cap this list was created with
+     */
+    int getMaxSize() {
+        return maxSize;
+    }
+
+    /**
+     * Creates a list with a cap of zero, so {@link #canAddError()} is always false.
+     * @return a list that accepts no errors
+     */
+    public static ParseErrorList noTracking() {
+        return new ParseErrorList(0, 0);
+    }
+
+    /**
+     * Creates a list that accepts up to {@code maxSize} errors.
+     * @param maxSize the cap on recorded errors
+     * @return a new, empty list
+     */
+    public static ParseErrorList tracking(int maxSize) {
+        return new ParseErrorList(INITIAL_CAPACITY, maxSize);
+    }
+}
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Token.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Token.java
new file mode 100644
index 00000000..0b157d07
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Token.java
@@ -0,0 +1,398 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+import android.support.annotation.NonNull;
+
+import ru.noties.markwon.html.jsoup.helper.Validate;
+import ru.noties.markwon.html.jsoup.nodes.Attributes;
+
+import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase;
+
+/**
+ * Parse tokens for the Tokeniser.
+ */
+public abstract class Token {
+
+ /** The kind of token this is; fixed at construction by each concrete subclass. */
+ public final TokenType type;
+
+ /**
+ * @param tokenType the kind of token being created
+ */
+ protected Token(@NonNull TokenType tokenType) {
+ this.type = tokenType;
+ }
+
+// String tokenType() {
+// return this.getClass().getSimpleName();
+// }
+
+ /**
+ * Reset the data represent by this token, for reuse. Prevents the need to create transfer objects for every
+ * piece of data, which immediately get GCed.
+ */
+ public abstract Token reset();
+
+    /** Empties {@code sb} so it can be reused; a null builder is ignored. */
+    static void reset(StringBuilder sb) {
+        if (sb == null) {
+            return;
+        }
+        sb.setLength(0);
+    }
+
+    /**
+     * A doctype token: name, the PUBLIC/SYSTEM keyword (if any), both identifiers, and the
+     * force-quirks flag. The string parts are accumulated in builders while tokenising.
+     */
+    public static final class Doctype extends Token {
+        final StringBuilder name = new StringBuilder();
+        String pubSysKey;
+        final StringBuilder publicIdentifier = new StringBuilder();
+        final StringBuilder systemIdentifier = new StringBuilder();
+        boolean forceQuirks;
+
+        Doctype() {
+            super(TokenType.Doctype);
+        }
+
+        /** Clears every part of the doctype so the instance can be reused. */
+        @Override
+        public Token reset() {
+            Token.reset(name);
+            Token.reset(publicIdentifier);
+            Token.reset(systemIdentifier);
+            pubSysKey = null;
+            forceQuirks = false;
+            return this;
+        }
+
+        /** @return the doctype name accumulated so far */
+        String getName() {
+            return name.toString();
+        }
+
+        /** @return the PUBLIC/SYSTEM keyword, or null if none was set */
+        String getPubSysKey() {
+            return pubSysKey;
+        }
+
+        /** @return the public identifier accumulated so far */
+        String getPublicIdentifier() {
+            return publicIdentifier.toString();
+        }
+
+        /** @return the system identifier accumulated so far */
+        public String getSystemIdentifier() {
+            return systemIdentifier.toString();
+        }
+
+        /** @return whether the force-quirks flag is set */
+        public boolean isForceQuirks() {
+            return forceQuirks;
+        }
+    }
+
+ /**
+ * Shared state for start and end tag tokens: the tag name (as written and lower-cased), the
+ * attributes collected so far, and the single attribute currently being accumulated.
+ * An attribute value is held either in pendingAttributeValueS (one appended string) or in the
+ * pendingAttributeValue builder (anything more); newAttribute() commits it to attributes.
+ */
+ public static abstract class Tag extends Token {
+
+ public String tagName;
+ public String normalName; // lc version of tag name, for case insensitive tree build
+ private String pendingAttributeName; // attribute names are generally caught in one hop, not accumulated
+ private StringBuilder pendingAttributeValue = new StringBuilder(); // but values are accumulated, from e.g. &amp; in hrefs
+ private String pendingAttributeValueS; // try to get attr vals in one shot, vs Builder
+ private boolean hasEmptyAttributeValue = false; // distinguish boolean attribute from empty string value
+ private boolean hasPendingAttributeValue = false; // set as soon as any value content has been appended
+ public boolean selfClosing = false;
+ public Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used).
+
+ protected Tag(@NonNull TokenType tokenType) {
+ super(tokenType);
+ }
+
+ /**
+ * Clears the names, the pending attribute state and the self-closing flag, and sets attributes to null.
+ */
+ @Override
+ public Tag reset() {
+ tagName = null;
+ normalName = null;
+ pendingAttributeName = null;
+ reset(pendingAttributeValue);
+ pendingAttributeValueS = null;
+ hasEmptyAttributeValue = false;
+ hasPendingAttributeValue = false;
+ selfClosing = false;
+ attributes = null;
+ return this;
+ }
+
+ /**
+ * Commits the pending attribute (if it has a non-blank name) to attributes, creating the
+ * Attributes holder on first use, then clears all pending attribute state.
+ * The stored value is the accumulated text when a value was appended, "" when the value was
+ * flagged as empty, and null otherwise.
+ */
+ final void newAttribute() {
+ if (attributes == null)
+ attributes = new Attributes();
+
+ if (pendingAttributeName != null) {
+ // the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here
+ pendingAttributeName = pendingAttributeName.trim();
+ if (pendingAttributeName.length() > 0) {
+ String value;
+ if (hasPendingAttributeValue)
+ // the builder wins when it holds anything; otherwise fall back to the one-shot string
+ value = pendingAttributeValue.length() > 0 ? pendingAttributeValue.toString() : pendingAttributeValueS;
+ else if (hasEmptyAttributeValue)
+ value = "";
+ else
+ value = null;
+ attributes.put(pendingAttributeName, value);
+ }
+ }
+ pendingAttributeName = null;
+ hasEmptyAttributeValue = false;
+ hasPendingAttributeValue = false;
+ reset(pendingAttributeValue);
+ pendingAttributeValueS = null;
+ }
+
+ /**
+ * Commits a still-pending attribute, if any, before the tag is emitted.
+ */
+ final void finaliseTag() {
+ // finalises for emit
+ if (pendingAttributeName != null) {
+ // todo: check if attribute name exists; if so, drop and error
+ newAttribute();
+ }
+ }
+
+ /**
+ * Returns the tag name as written; validates first that a non-empty name has been set.
+ */
+ final String name() { // preserves case, for input into Tag.valueOf (which may drop case)
+ Validate.isFalse(tagName == null || tagName.length() == 0);
+ return tagName;
+ }
+
+ final String normalName() { // loses case, used in tree building for working out where in tree it should go
+ return normalName;
+ }
+
+ /**
+ * Sets the tag name and its lower-cased form.
+ * @return this tag, for chaining
+ */
+ final Tag name(String name) {
+ tagName = name;
+ normalName = lowerCase(name);
+ return this;
+ }
+
+ final boolean isSelfClosing() {
+ return selfClosing;
+ }
+
+ @SuppressWarnings({"TypeMayBeWeakened"})
+ final Attributes getAttributes() {
+ return attributes;
+ }
+
+ // these appenders are rarely hit in not null state-- caused by null chars.
+ /**
+ * Appends to the tag name (or starts it) and refreshes the lower-cased form.
+ */
+ final void appendTagName(String append) {
+ tagName = tagName == null ? append : tagName.concat(append);
+ normalName = lowerCase(tagName);
+ }
+
+ final void appendTagName(char append) {
+ appendTagName(String.valueOf(append));
+ }
+
+ /**
+ * Appends to the pending attribute name (or starts it).
+ */
+ final void appendAttributeName(String append) {
+ pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append);
+ }
+
+ final void appendAttributeName(char append) {
+ appendAttributeName(String.valueOf(append));
+ }
+
+ /**
+ * Appends a string to the pending attribute value. While the builder is still empty the string
+ * is only remembered in pendingAttributeValueS, avoiding a copy for single-shot values.
+ */
+ final void appendAttributeValue(String append) {
+ ensureAttributeValue();
+ if (pendingAttributeValue.length() == 0) {
+ pendingAttributeValueS = append;
+ } else {
+ pendingAttributeValue.append(append);
+ }
+ }
+
+ final void appendAttributeValue(char append) {
+ ensureAttributeValue();
+ pendingAttributeValue.append(append);
+ }
+
+ final void appendAttributeValue(char[] append) {
+ ensureAttributeValue();
+ pendingAttributeValue.append(append);
+ }
+
+ /**
+ * Appends each of the given Unicode code points to the pending attribute value.
+ */
+ final void appendAttributeValue(int[] appendCodepoints) {
+ ensureAttributeValue();
+ for (int codepoint : appendCodepoints) {
+ pendingAttributeValue.appendCodePoint(codepoint);
+ }
+ }
+
+ /**
+ * Flags the pending attribute as having an explicitly empty value.
+ */
+ final void setEmptyAttributeValue() {
+ hasEmptyAttributeValue = true;
+ }
+
+ /**
+ * Marks the pending attribute as having a value and, if a one-shot string is held, moves it
+ * into the builder so that further appends accumulate after it.
+ */
+ private void ensureAttributeValue() {
+ hasPendingAttributeValue = true;
+ // if on second hit, we'll need to move to the builder
+ if (pendingAttributeValueS != null) {
+ pendingAttributeValue.append(pendingAttributeValueS);
+ pendingAttributeValueS = null;
+ }
+ }
+ }
+
+    /**
+     * A start tag token. Unlike other tags it always carries an {@code Attributes} instance,
+     * both after construction and after {@code reset()}.
+     */
+    public final static class StartTag extends Tag {
+        StartTag() {
+            super(TokenType.StartTag);
+            attributes = new Attributes();
+        }
+
+        /** Clears the tag and installs a fresh, empty attribute holder. */
+        @Override
+        public Tag reset() {
+            super.reset();
+            // todo - would prefer these to be null, but need to check Element assertions
+            attributes = new Attributes();
+            return this;
+        }
+
+        /**
+         * Sets the tag name (plus its lower-cased form) and replaces the attribute holder.
+         * @return this tag, for chaining
+         */
+        StartTag nameAttr(String name, Attributes attributes) {
+            this.tagName = name;
+            this.attributes = attributes;
+            this.normalName = lowerCase(this.tagName);
+            return this;
+        }
+
+        /** Renders the tag as {@code <name>} or {@code <name attributes>}. */
+        @Override
+        public String toString() {
+            final boolean hasAttributes = attributes != null && attributes.size() > 0;
+            if (!hasAttributes) {
+                return "<" + name() + ">";
+            }
+            return "<" + name() + " " + attributes.toString() + ">";
+        }
+    }
+
+    /**
+     * An end tag token. Its attribute holder stays null unless the tokeniser encounters
+     * attributes on the end tag.
+     */
+    public final static class EndTag extends Tag {
+        EndTag() {
+            super(TokenType.EndTag);
+        }
+
+        /** Renders the tag as {@code </name>}. */
+        @Override
+        public String toString() {
+            // the "</" prefix was missing, which rendered end tags as "name>"
+            return "</" + name() + ">";
+        }
+    }
+
+    /**
+     * A comment token; the comment text is accumulated in {@code data} while tokenising.
+     */
+    public final static class Comment extends Token {
+        final StringBuilder data = new StringBuilder();
+        boolean bogus = false;
+
+        Comment() {
+            super(TokenType.Comment);
+        }
+
+        /** Clears the comment text and the bogus flag so the instance can be reused. */
+        @Override
+        public Token reset() {
+            reset(data);
+            bogus = false;
+            return this;
+        }
+
+        /** @return the comment text accumulated so far */
+        String getData() {
+            return data.toString();
+        }
+
+        /** Renders the comment in markup form, {@code <!--data-->}. */
+        @Override
+        public String toString() {
+            // previously returned an empty string, hiding the comment text from debug output
+            return "<!--" + getData() + "-->";
+        }
+    }
+
+ /**
+ * A run of character data. Note that inside Token this class shadows java.lang.Character.
+ */
+ public static class Character extends Token {
+ private String data;
+
+ Character() {
+ super(TokenType.Character);
+ }
+
+ /** Clears the data so the instance can be reused. */
+ @Override
+ public Token reset() {
+ data = null;
+ return this;
+ }
+
+ /**
+ * Sets the character data.
+ * @return this token, for chaining
+ */
+ Character data(String data) {
+ this.data = data;
+ return this;
+ }
+
+ /** @return the character data; null after reset() until data is set again */
+ public String getData() {
+ return data;
+ }
+
+ @Override
+ public String toString() {
+ return getData();
+ }
+ }
+
+    /**
+     * Character data that came from a CDATA section. It keeps the {@code Character} token type;
+     * only the class distinguishes it.
+     */
+    public final static class CData extends Character {
+        /**
+         * @param data the text inside the CDATA section
+         */
+        CData(String data) {
+            super();
+            this.data(data);
+        }
+
+        /** Renders the section in markup form, {@code <![CDATA[data]]>}. */
+        @Override
+        public String toString() {
+            // previously returned an empty string, hiding the section's text from debug output
+            return "<![CDATA[" + getData() + "]]>";
+        }
+
+    }
+
+ /**
+ * Marks the end of the input. Carries no data, so reset() has nothing to clear.
+ */
+ public final static class EOF extends Token {
+ EOF() {
+ super(Token.TokenType.EOF);
+ }
+
+ @Override
+ public Token reset() {
+ return this;
+ }
+ }
+
+// final boolean isDoctype() {
+// return type == TokenType.Doctype;
+// }
+//
+// final Doctype asDoctype() {
+// return (Doctype) this;
+// }
+//
+// final boolean isStartTag() {
+// return type == TokenType.StartTag;
+// }
+//
+// final StartTag asStartTag() {
+// return (StartTag) this;
+// }
+//
+// final boolean isEndTag() {
+// return type == TokenType.EndTag;
+// }
+//
+// final EndTag asEndTag() {
+// return (EndTag) this;
+// }
+//
+// final boolean isComment() {
+// return type == TokenType.Comment;
+// }
+//
+// final Comment asComment() {
+// return (Comment) this;
+// }
+//
+// final boolean isCharacter() {
+// return type == TokenType.Character;
+// }
+//
+// final boolean isCData() {
+// return this instanceof CData;
+// }
+//
+// final Character asCharacter() {
+// return (Character) this;
+// }
+//
+// final boolean isEOF() {
+// return type == TokenType.EOF;
+// }
+
+ /**
+ * The kinds of token; each Token subclass passes its own constant to the Token constructor.
+ */
+ public enum TokenType {
+ Doctype,
+ StartTag,
+ EndTag,
+ Comment,
+ Character, // note no CData - treated in builder as an extension of Character
+ EOF
+ }
+}
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Tokeniser.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Tokeniser.java
new file mode 100644
index 00000000..3d5284bd
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Tokeniser.java
@@ -0,0 +1,295 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+import java.util.Arrays;
+
+import ru.noties.markwon.html.jsoup.helper.Validate;
+import ru.noties.markwon.html.jsoup.nodes.Entities;
+
+/**
+ * Reads the input stream into tokens.
+ */
+public final class Tokeniser {
+ static final char replacementChar = '\uFFFD'; // replaces null character
+ private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};
+
+ // Some illegal character escapes are parsed by browsers as windows-1252 instead. See issue #1034
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
+ static final int win1252ExtensionsStart = 0x80;
+ static final int[] win1252Extensions = new int[] {
+ // we could build this manually, but Windows-1252 is not a standard java charset so that could break on
+ // some platforms - this table is verified with a test
+ 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
+ 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+ 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
+ };
+
+ static {
+ Arrays.sort(notCharRefCharsSorted);
+ }
+
+ private final CharacterReader reader; // html input
+ private final ParseErrorList errors; // errors found while tokenising
+
+ private TokeniserState state = TokeniserState.Data; // current tokenisation state
+ private Token emitPending; // the token we are about to emit on next read
+ private boolean isEmitPending = false; // true from emit(Token) until read() hands that token out
+ private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one
+ private StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
+ // NOTE(review): the tail of the next comment was lost (stripped as markup); upstream jsoup reads "</script>" - confirm
+ StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>
+
+ // the instances below are reused for every token of their kind (reset rather than re-created)
+ Token.Tag tagPending; // tag we are building up
+ Token.StartTag startPending = new Token.StartTag();
+ Token.EndTag endPending = new Token.EndTag();
+ Token.Character charPending = new Token.Character();
+ Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
+ Token.Comment commentPending = new Token.Comment(); // comment building up
+ private String lastStartTag; // the last start tag emitted, to test appropriate end tag
+
+ /**
+ * @param reader the input to tokenise
+ * @param errors receives parse errors; its canAddError() is consulted before each add
+ */
+ public Tokeniser(CharacterReader reader, ParseErrorList errors) {
+ this.reader = reader;
+ this.errors = errors;
+ }
+
+    /**
+     * Runs the state machine until a token has been emitted, then returns the next token.
+     * Characters emitted before that token are returned first, as one character token, and the
+     * emitted token is kept for the following call.
+     * @return the next token; character tokens reuse the shared {@code charPending} instance
+     */
+    public Token read() {
+        while (!isEmitPending) {
+            state.read(this, reader);
+        }
+
+        // several character emits were merged into the builder
+        if (charsBuilder.length() > 0) {
+            final String merged = charsBuilder.toString();
+            charsBuilder.setLength(0);
+            charsString = null;
+            return charPending.data(merged);
+        }
+
+        // exactly one character emit
+        if (charsString != null) {
+            final String single = charsString;
+            charsString = null;
+            return charPending.data(single);
+        }
+
+        // no characters outstanding: hand out the pending token itself
+        isEmitPending = false;
+        return emitPending;
+    }
+
+ void emit(Token token) {
+ Validate.isFalse(isEmitPending, "There is an unread token pending!");
+
+ emitPending = token;
+ isEmitPending = true;
+
+ if (token.type == Token.TokenType.StartTag) {
+ Token.StartTag startTag = (Token.StartTag) token;
+ lastStartTag = startTag.tagName;
+ } else if (token.type == Token.TokenType.EndTag) {
+ Token.EndTag endTag = (Token.EndTag) token;
+ if (endTag.attributes != null)
+ error("Attributes incorrectly present on end tag");
+ }
+ }
+
+    /**
+     * Buffers character data so that a run of emits (text, character references, ...) leaves
+     * {@code read()} as a single character token. The first string is kept as-is; from the
+     * second emit onwards everything is gathered in {@code charsBuilder}.
+     * Does not set {@code isEmitPending}; {@code read()} checks the buffers itself.
+     */
+    void emit(final String str) {
+        if (charsString == null) {
+            charsString = str;
+            return;
+        }
+        // second or later emit before a read: move to the builder, seeding it with the first string
+        if (charsBuilder.length() == 0) {
+            charsBuilder.append(charsString);
+        }
+        charsBuilder.append(str);
+    }
+
+ /** Buffers the given chars as character data; see emit(String). */
+ void emit(char[] chars) {
+ emit(String.valueOf(chars));
+ }
+
+ /** Buffers the given Unicode code points as character data; see emit(String). */
+ void emit(int[] codepoints) {
+ emit(new String(codepoints, 0, codepoints.length));
+ }
+
+ /** Buffers a single char as character data; see emit(String). */
+ void emit(char c) {
+ emit(String.valueOf(c));
+ }
+
+ /** @return the current tokenisation state */
+ TokeniserState getState() {
+ return state;
+ }
+
+ /** Switches to the given state without touching the reader. */
+ void transition(TokeniserState state) {
+ this.state = state;
+ }
+
+ /** Steps the reader past the current character, then switches to the given state. */
+ void advanceTransition(TokeniserState state) {
+ reader.advance();
+ this.state = state;
+ }
+
+ final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
+ final private int[] multipointHolder = new int[2];
+    /**
+     * Tries to consume a character reference (numeric {@code #123;} / {@code #x7B;}, or named)
+     * at the reader's current position, which is expected to be just after the {@code &}.
+     * When no reference is recognised the reader is rewound to where it started, except that
+     * nothing is consumed at all in the three early-exit cases at the top.
+     *
+     * @param additionalAllowedCharacter if non-null and equal to the current character, no
+     *        reference is attempted
+     * @param inAttribute whether the reference occurs inside an attribute value; a named
+     *        reference directly followed by a letter, digit, {@code =}, {@code -} or {@code _}
+     *        is then left alone
+     * @return the referenced code point(s), or null if there is no reference here. The array is
+     *         one of two holders owned by this tokeniser and is overwritten by later calls, so
+     *         callers must use it immediately and not keep it
+     */
+    int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
+        if (reader.isEmpty())
+            return null;
+        if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
+            return null;
+        if (reader.matchesAnySorted(notCharRefCharsSorted))
+            return null;
+
+        final int[] codeRef = codepointHolder;
+        reader.mark();
+        if (reader.matchConsume("#")) { // numbered
+            boolean isHexMode = reader.matchConsumeIgnoreCase("X");
+            String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
+            if (numRef.length() == 0) { // didn't match anything
+                characterReferenceError("numeric reference with no numerals");
+                reader.rewindToMark();
+                return null;
+            }
+            if (!reader.matchConsume(";"))
+                characterReferenceError("missing semicolon"); // missing semi
+            int charval = -1;
+            try {
+                // parseInt yields the primitive directly; valueOf would box only to unbox again
+                charval = Integer.parseInt(numRef, isHexMode ? 16 : 10);
+            } catch (NumberFormatException ignored) {
+                // too many digits for an int: charval stays -1 and is reported as out of range below
+            }
+            if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
+                characterReferenceError("character outside of valid range");
+                codeRef[0] = replacementChar;
+                return codeRef;
+            } else {
+                // fix illegal unicode characters to match browser behavior
+                if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) {
+                    characterReferenceError("character is not a valid unicode code point");
+                    charval = win1252Extensions[charval - win1252ExtensionsStart];
+                }
+
+                // todo: implement number replacement table
+                // todo: check for extra illegal unicode points as parse errors
+                codeRef[0] = charval;
+                return codeRef;
+            }
+        } else { // named
+            // get as many letters as possible, and look for matching entities.
+            String nameRef = reader.consumeLetterThenDigitSequence();
+            boolean looksLegit = reader.matches(';');
+            // found if a base named entity without a ;, or an extended entity with the ;.
+            boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
+
+            if (!found) {
+                reader.rewindToMark();
+                if (looksLegit) // named with semicolon
+                    characterReferenceError(String.format("invalid named referenece '%s'", nameRef));
+                return null;
+            }
+            if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
+                // don't want that to match
+                reader.rewindToMark();
+                return null;
+            }
+            if (!reader.matchConsume(";"))
+                characterReferenceError("missing semicolon"); // missing semi
+            int numChars = Entities.codepointsForName(nameRef, multipointHolder);
+            if (numChars == 1) {
+                codeRef[0] = multipointHolder[0];
+                return codeRef;
+            } else if (numChars == 2) {
+                return multipointHolder;
+            } else {
+                Validate.fail("Unexpected characters returned for " + nameRef);
+                return multipointHolder;
+            }
+        }
+    }
+
+ /**
+ * Resets the shared start or end tag instance and makes it the tag being built.
+ * @param start true for a start tag, false for an end tag
+ * @return the tag now pending
+ */
+ Token.Tag createTagPending(boolean start) {
+ tagPending = start ? startPending.reset() : endPending.reset();
+ return tagPending;
+ }
+
+ /** Commits any attribute still being accumulated on the pending tag, then emits the tag. */
+ void emitTagPending() {
+ tagPending.finaliseTag();
+ emit(tagPending);
+ }
+
+ /** Resets the shared comment token, ready to accumulate a new comment. */
+ void createCommentPending() {
+ commentPending.reset();
+ }
+
+ /** Emits the shared comment token. */
+ void emitCommentPending() {
+ emit(commentPending);
+ }
+
+ /** Resets the shared doctype token, ready to accumulate a new doctype. */
+ void createDoctypePending() {
+ doctypePending.reset();
+ }
+
+ /** Emits the shared doctype token. */
+ void emitDoctypePending() {
+ emit(doctypePending);
+ }
+
+ /** Empties dataBuffer. */
+ void createTempBuffer() {
+ Token.reset(dataBuffer);
+ }
+
+ /**
+ * Tests whether the pending tag's name equals, ignoring case, the name of the last start tag emitted.
+ * Returns false when no start tag has been emitted yet.
+ */
+ boolean isAppropriateEndTagToken() {
+ return lastStartTag != null && tagPending.name().equalsIgnoreCase(lastStartTag);
+ }
+
+ /** @return the name of the last start tag emitted */
+ String appropriateEndTagName() {
+ return lastStartTag; // could be null
+ }
+
+ // Each reporter below records a ParseError at the reader's current position, but only while
+ // the error list still accepts errors (canAddError()).
+
+ /** Reports the reader's current character as unexpected in the given state. */
+ void error(TokeniserState state) {
+ if (errors.canAddError())
+ errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
+ }
+
+ /** Reports that the input ended while in the given state. */
+ void eofError(TokeniserState state) {
+ if (errors.canAddError())
+ errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
+ }
+
+ /** Reports a malformed character reference; message describes what was wrong with it. */
+ private void characterReferenceError(String message) {
+ if (errors.canAddError())
+ errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
+ }
+
+ /** Reports an error with the given message, used verbatim (not as a format pattern). */
+ void error(String errorMsg) {
+ if (errors.canAddError())
+ errors.add(new ParseError(reader.pos(), errorMsg));
+ }
+
+ /** Placeholder for namespace support: currently always reports the HTML namespace. */
+ boolean currentNodeInHtmlNS() {
+ // todo: implement namespaces correctly
+ return true;
+ // Element currentNode = currentNode();
+ // return currentNode != null && currentNode.namespace().equals("HTML");
+ }
+
+// /**
+// * Utility method to consume reader and unescape entities found within.
+// * @param inAttribute if the text to be unescaped is in an attribute
+// * @return unescaped string from reader
+// */
+// String unescapeEntities(boolean inAttribute) {
+// StringBuilder builder = StringUtil.stringBuilder();
+// while (!reader.isEmpty()) {
+// builder.append(reader.consumeTo('&'));
+// if (reader.matches('&')) {
+// reader.consume();
+// int[] c = consumeCharacterReference(null, inAttribute);
+// if (c == null || c.length==0)
+// builder.append('&');
+// else {
+// builder.appendCodePoint(c[0]);
+// if (c.length == 2)
+// builder.appendCodePoint(c[1]);
+// }
+//
+// }
+// }
+// return builder.toString();
+// }
+}
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/TokeniserState.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/TokeniserState.java
new file mode 100644
index 00000000..01a98958
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/TokeniserState.java
@@ -0,0 +1,1737 @@
+package ru.noties.markwon.html.jsoup.parser;
+
+import ru.noties.markwon.html.jsoup.nodes.DocumentType;
+
+/**
+ * States and transition activations for the Tokeniser.
+ */
+enum TokeniserState {
+ Data {
+     // in data state, gather characters until a character reference or tag is found
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.current();
+         if (next == '&') {
+             t.advanceTransition(CharacterReferenceInData);
+         } else if (next == '<') {
+             t.advanceTransition(TagOpen);
+         } else if (next == nullChar) {
+             t.error(this); // NOT replacement character (oddly?)
+             t.emit(r.consume());
+         } else if (next == eof) {
+             t.emit(new Token.EOF());
+         } else {
+             t.emit(r.consumeData());
+         }
+     }
+ },
+ CharacterReferenceInData {
+     // from & in data: emit the consumed reference, or a literal '&' when none was consumed, then back to Data
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final int[] codepoints = t.consumeCharacterReference(null, false);
+         if (codepoints != null) {
+             t.emit(codepoints);
+         } else {
+             t.emit('&');
+         }
+         t.transition(Data);
+     }
+ },
+ Rcdata {
+     // handles data in title, textarea etc
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.current();
+         if (next == '&') {
+             t.advanceTransition(CharacterReferenceInRcdata);
+         } else if (next == '<') {
+             t.advanceTransition(RcdataLessthanSign);
+         } else if (next == nullChar) {
+             t.error(this);
+             r.advance();
+             t.emit(replacementChar);
+         } else if (next == eof) {
+             t.emit(new Token.EOF());
+         } else {
+             t.emit(r.consumeToAny('&', '<', nullChar));
+         }
+     }
+ },
+ CharacterReferenceInRcdata {
+     // from & in rcdata: emit the consumed reference, or a literal '&' when none was consumed, then back to Rcdata
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final int[] codepoints = t.consumeCharacterReference(null, false);
+         if (codepoints != null) {
+             t.emit(codepoints);
+         } else {
+             t.emit('&');
+         }
+         t.transition(Rcdata);
+     }
+ },
+ Rawtext {
+     // raw text: shared data reader, leaving through RawtextLessthanSign on '<'
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         readData(t, r, this, RawtextLessthanSign);
+     }
+ },
+ ScriptData {
+     // script data: shared data reader, leaving through ScriptDataLessthanSign on '<'
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         readData(t, r, this, ScriptDataLessthanSign);
+     }
+ },
+ PLAINTEXT {
+     // plaintext: everything up to EOF is character data; only NUL is special-cased
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.current();
+         if (next == nullChar) {
+             t.error(this);
+             r.advance();
+             t.emit(replacementChar);
+         } else if (next == eof) {
+             t.emit(new Token.EOF());
+         } else {
+             t.emit(r.consumeTo(nullChar));
+         }
+     }
+ },
+ TagOpen {
+     // from < in data
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.current();
+         if (next == '!') {
+             t.advanceTransition(MarkupDeclarationOpen);
+         } else if (next == '/') {
+             t.advanceTransition(EndTagOpen);
+         } else if (next == '?') {
+             t.advanceTransition(BogusComment);
+         } else if (r.matchesLetter()) {
+             t.createTagPending(true);
+             t.transition(TagName);
+         } else {
+             t.error(this);
+             t.emit('<'); // char that got us here
+             t.transition(Data);
+         }
+     }
+ },
+ EndTagOpen {
+     // from </ in data
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.isEmpty()) {
+             t.eofError(this);
+             // input ended right after "</": the two consumed characters are plain text
+             t.emit("</");
+             t.transition(Data);
+         } else if (r.matchesLetter()) {
+             t.createTagPending(false);
+             t.transition(TagName);
+         } else if (r.matches('>')) {
+             // "</>" is dropped entirely
+             t.error(this);
+             t.advanceTransition(Data);
+         } else {
+             t.error(this);
+             t.advanceTransition(BogusComment);
+         }
+     }
+ },
+ TagName {
+     // from < or </ in data, will have start or end tag pending
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         // previous TagOpen state did NOT consume, will have a letter char in current
+         t.tagPending.appendTagName(r.consumeTagName());
+
+         final char next = r.consume();
+         switch (next) {
+             case '\t': case '\n': case '\r': case '\f': case ' ':
+                 t.transition(BeforeAttributeName);
+                 break;
+             case '/':
+                 t.transition(SelfClosingStartTag);
+                 break;
+             case '>':
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case nullChar: // replacement
+                 t.tagPending.appendTagName(replacementStr);
+                 break;
+             case eof: // should emit pending tag?
+                 t.eofError(this);
+                 t.transition(Data);
+                 break;
+             default: // buffer underrun
+                 t.tagPending.appendTagName(next);
+         }
+     }
+ },
+ RcdataLessthanSign {
+     // from < in rcdata
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matches('/')) {
+             t.createTempBuffer();
+             t.advanceTransition(RCDATAEndTagOpen);
+         } else if (r.matchesLetter() && t.appropriateEndTagName() != null && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
+             // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than
+             // consuming to EOF; break out here
+             t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName());
+             t.emitTagPending();
+             r.unconsume(); // undo "<"
+             t.transition(Data);
+         } else {
+             t.emit("<");
+             t.transition(Rcdata);
+         }
+     }
+ },
+ RCDATAEndTagOpen {
+     // from </ in rcdata
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             t.createTagPending(false);
+             t.tagPending.appendTagName(r.current());
+             t.dataBuffer.append(r.current());
+             t.advanceTransition(RCDATAEndTagName);
+         } else {
+             // not an end tag after all: the consumed "</" is plain text
+             t.emit("</");
+             t.transition(Rcdata);
+         }
+     }
+ },
+ RCDATAEndTagName {
+     // collects the name of a possible end tag inside rcdata; it only counts as a tag when
+     // t.isAppropriateEndTagToken() says so, otherwise the buffered text is emitted as data
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             String name = r.consumeLetterSequence();
+             t.tagPending.appendTagName(name);
+             t.dataBuffer.append(name);
+             return;
+         }
+
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\r':
+             case '\f':
+             case ' ':
+                 if (t.isAppropriateEndTagToken())
+                     t.transition(BeforeAttributeName);
+                 else
+                     anythingElse(t, r);
+                 break;
+             case '/':
+                 if (t.isAppropriateEndTagToken())
+                     t.transition(SelfClosingStartTag);
+                 else
+                     anythingElse(t, r);
+                 break;
+             case '>':
+                 if (t.isAppropriateEndTagToken()) {
+                     t.emitTagPending();
+                     t.transition(Data);
+                 }
+                 else
+                     anythingElse(t, r);
+                 break;
+             default:
+                 anythingElse(t, r);
+         }
+     }
+
+     // fallback: emit "</" plus the buffered name as text, put the last character back, resume Rcdata
+     private void anythingElse(Tokeniser t, CharacterReader r) {
+         t.emit("</" + t.dataBuffer.toString());
+         r.unconsume();
+         t.transition(Rcdata);
+     }
+ },
+ RawtextLessthanSign {
+     // from < in rawtext
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         if (!r.matches('/')) {
+             t.emit('<');
+             t.transition(Rawtext);
+             return;
+         }
+         t.createTempBuffer();
+         t.advanceTransition(RawtextEndTagOpen);
+     }
+ },
+ RawtextEndTagOpen {
+     // shared end-tag-open handling: RawtextEndTagName on a letter, otherwise back to Rawtext
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         readEndTag(t, r, RawtextEndTagName, Rawtext);
+     }
+ },
+ RawtextEndTagName {
+     // shared end-tag-name handling, falling back to Rawtext
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         handleDataEndTag(t, r, Rawtext);
+     }
+ },
+ // NOTE(review): in this copy the text between the "<!" literal of ScriptDataLessthanSign and the
+ // '>' case of ScriptDataEscapedDashDash was lost, leaving an unterminated string and dropping
+ // seven states. They are restored below following the HTML tokenizer specification and the
+ // structure of the sibling states in this file -- confirm against the upstream source.
+ ScriptDataLessthanSign {
+     // from < in script data
+     void read(Tokeniser t, CharacterReader r) {
+         switch (r.consume()) {
+             case '/':
+                 t.createTempBuffer();
+                 t.transition(ScriptDataEndTagOpen);
+                 break;
+             case '!':
+                 t.emit("<!");
+                 t.transition(ScriptDataEscapeStart);
+                 break;
+             default:
+                 t.emit("<");
+                 r.unconsume(); // let ScriptData see the character again
+                 t.transition(ScriptData);
+         }
+     }
+ },
+ ScriptDataEndTagOpen {
+     void read(Tokeniser t, CharacterReader r) {
+         readEndTag(t, r, ScriptDataEndTagName, ScriptData);
+     }
+ },
+ ScriptDataEndTagName {
+     void read(Tokeniser t, CharacterReader r) {
+         handleDataEndTag(t, r, ScriptData);
+     }
+ },
+ ScriptDataEscapeStart {
+     // after "<!" in script data: a '-' continues towards the escaped ("<!--") states
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matches('-')) {
+             t.emit('-');
+             t.advanceTransition(ScriptDataEscapeStartDash);
+         } else {
+             t.transition(ScriptData);
+         }
+     }
+ },
+ ScriptDataEscapeStartDash {
+     // after "<!-" in script data
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matches('-')) {
+             t.emit('-');
+             t.advanceTransition(ScriptDataEscapedDashDash);
+         } else {
+             t.transition(ScriptData);
+         }
+     }
+ },
+ ScriptDataEscaped {
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.isEmpty()) {
+             t.eofError(this);
+             t.transition(Data);
+             return;
+         }
+
+         switch (r.current()) {
+             case '-':
+                 t.emit('-');
+                 t.advanceTransition(ScriptDataEscapedDash);
+                 break;
+             case '<':
+                 t.advanceTransition(ScriptDataEscapedLessthanSign);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 r.advance();
+                 t.emit(replacementChar);
+                 break;
+             default:
+                 String data = r.consumeToAny('-', '<', nullChar);
+                 t.emit(data);
+         }
+     }
+ },
+ ScriptDataEscapedDash {
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.isEmpty()) {
+             t.eofError(this);
+             t.transition(Data);
+             return;
+         }
+
+         char c = r.consume();
+         switch (c) {
+             case '-':
+                 t.emit(c);
+                 t.transition(ScriptDataEscapedDashDash);
+                 break;
+             case '<':
+                 t.transition(ScriptDataEscapedLessthanSign);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.emit(replacementChar);
+                 t.transition(ScriptDataEscaped);
+                 break;
+             default:
+                 t.emit(c);
+                 t.transition(ScriptDataEscaped);
+         }
+     }
+ },
+ ScriptDataEscapedDashDash {
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.isEmpty()) {
+             t.eofError(this);
+             t.transition(Data);
+             return;
+         }
+
+         char c = r.consume();
+         switch (c) {
+             case '-':
+                 t.emit(c);
+                 break;
+             case '<':
+                 t.transition(ScriptDataEscapedLessthanSign);
+                 break;
+             case '>':
+                 t.emit(c);
+                 t.transition(ScriptData);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.emit(replacementChar);
+                 t.transition(ScriptDataEscaped);
+                 break;
+             default:
+                 t.emit(c);
+                 t.transition(ScriptDataEscaped);
+         }
+     }
+ },
+ ScriptDataEscapedLessthanSign {
+     // from < in escaped script data
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             t.createTempBuffer();
+             final char letter = r.current();
+             t.dataBuffer.append(letter);
+             t.emit("<" + letter);
+             t.advanceTransition(ScriptDataDoubleEscapeStart);
+             return;
+         }
+         if (r.matches('/')) {
+             t.createTempBuffer();
+             t.advanceTransition(ScriptDataEscapedEndTagOpen);
+             return;
+         }
+         t.emit('<');
+         t.transition(ScriptDataEscaped);
+     }
+ },
+ ScriptDataEscapedEndTagOpen {
+     // from </ in escaped script data
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             t.createTagPending(false);
+             t.tagPending.appendTagName(r.current());
+             t.dataBuffer.append(r.current());
+             t.advanceTransition(ScriptDataEscapedEndTagName);
+         } else {
+             // not an end tag after all: the consumed "</" is plain text
+             t.emit("</");
+             t.transition(ScriptDataEscaped);
+         }
+     }
+ },
+ ScriptDataEscapedEndTagName {
+     // shared end-tag-name handling, falling back to ScriptDataEscaped
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         handleDataEndTag(t, r, ScriptDataEscaped);
+     }
+ },
+ ScriptDataDoubleEscapeStart {
+     // shared double-escape handling: ScriptDataDoubleEscaped when the buffered name is "script",
+     // otherwise ScriptDataEscaped
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         handleDataDoubleEscapeTag(t, r, ScriptDataDoubleEscaped, ScriptDataEscaped);
+     }
+ },
+ ScriptDataDoubleEscaped {
+     // double-escaped script data: everything is emitted as text; '-' and '<' change state
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.current();
+         if (next == '-') {
+             t.emit(next);
+             t.advanceTransition(ScriptDataDoubleEscapedDash);
+         } else if (next == '<') {
+             t.emit(next);
+             t.advanceTransition(ScriptDataDoubleEscapedLessthanSign);
+         } else if (next == nullChar) {
+             t.error(this);
+             r.advance();
+             t.emit(replacementChar);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.transition(Data);
+         } else {
+             t.emit(r.consumeToAny('-', '<', nullChar));
+         }
+     }
+ },
+ ScriptDataDoubleEscapedDash {
+     // one '-' seen inside double-escaped script data
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         if (next == '-') {
+             t.emit(next);
+             t.transition(ScriptDataDoubleEscapedDashDash);
+         } else if (next == '<') {
+             t.emit(next);
+             t.transition(ScriptDataDoubleEscapedLessthanSign);
+         } else if (next == nullChar) {
+             t.error(this);
+             t.emit(replacementChar);
+             t.transition(ScriptDataDoubleEscaped);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.transition(Data);
+         } else {
+             t.emit(next);
+             t.transition(ScriptDataDoubleEscaped);
+         }
+     }
+ },
+ ScriptDataDoubleEscapedDashDash {
+     // "--" seen inside double-escaped script data; '>' ends the escape
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         if (next == '-') {
+             t.emit(next); // further dashes keep us here
+         } else if (next == '<') {
+             t.emit(next);
+             t.transition(ScriptDataDoubleEscapedLessthanSign);
+         } else if (next == '>') {
+             t.emit(next);
+             t.transition(ScriptData);
+         } else if (next == nullChar) {
+             t.error(this);
+             t.emit(replacementChar);
+             t.transition(ScriptDataDoubleEscaped);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.transition(Data);
+         } else {
+             t.emit(next);
+             t.transition(ScriptDataDoubleEscaped);
+         }
+     }
+ },
+ ScriptDataDoubleEscapedLessthanSign {
+     // from < in double-escaped script data
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         if (!r.matches('/')) {
+             t.transition(ScriptDataDoubleEscaped);
+             return;
+         }
+         t.emit('/');
+         t.createTempBuffer();
+         t.advanceTransition(ScriptDataDoubleEscapeEnd);
+     }
+ },
+ ScriptDataDoubleEscapeEnd {
+     // shared double-escape handling: ScriptDataEscaped when the buffered name is "script",
+     // otherwise ScriptDataDoubleEscaped
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         handleDataDoubleEscapeTag(t, r, ScriptDataEscaped, ScriptDataDoubleEscaped);
+     }
+ },
+ // NOTE(review): the head of this state (method signature through the '>' case label) was lost in
+ // this copy; it is restored below following the HTML tokenizer specification and the sibling
+ // attribute states in this file -- confirm against the upstream source.
+ BeforeAttributeName {
+     // from tagname <xxx
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\r':
+             case '\f':
+             case ' ':
+                 break; // ignore whitespace
+             case '/':
+                 t.transition(SelfClosingStartTag);
+                 break;
+             case '>':
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.tagPending.newAttribute();
+                 r.unconsume();
+                 t.transition(AttributeName);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.transition(Data);
+                 break;
+             case '"':
+             case '\'':
+             case '<':
+             case '=':
+                 t.error(this);
+                 t.tagPending.newAttribute();
+                 t.tagPending.appendAttributeName(c);
+                 t.transition(AttributeName);
+                 break;
+             default: // A-Z, anything else
+                 t.tagPending.newAttribute();
+                 r.unconsume();
+                 t.transition(AttributeName);
+         }
+     }
+ },
+ AttributeName {
+     // from before attribute name
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         t.tagPending.appendAttributeName(r.consumeToAnySorted(attributeNameCharsSorted));
+
+         final char next = r.consume();
+         switch (next) {
+             case '\t': case '\n': case '\r': case '\f': case ' ':
+                 t.transition(AfterAttributeName);
+                 break;
+             case '/':
+                 t.transition(SelfClosingStartTag);
+                 break;
+             case '=':
+                 t.transition(BeforeAttributeValue);
+                 break;
+             case '>':
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.tagPending.appendAttributeName(replacementChar);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.transition(Data);
+                 break;
+             case '"': case '\'': case '<':
+                 // reported as an error, but still taken as part of the name
+                 t.error(this);
+                 t.tagPending.appendAttributeName(next);
+                 break;
+             default: // buffer underrun
+                 t.tagPending.appendAttributeName(next);
+         }
+     }
+ },
+ AfterAttributeName {
+     // whitespace seen after an attribute name: either '=' follows, or a new attribute starts
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\r':
+             case '\f':
+             case ' ':
+                 // ignore
+                 break;
+             case '/':
+                 t.transition(SelfClosingStartTag);
+                 break;
+             case '=':
+                 t.transition(BeforeAttributeValue);
+                 break;
+             case '>':
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 // like every other non-special character here, NUL begins a NEW attribute;
+                 // without this call it was appended to the previous attribute's name
+                 t.tagPending.newAttribute();
+                 t.tagPending.appendAttributeName(replacementChar);
+                 t.transition(AttributeName);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.transition(Data);
+                 break;
+             case '"':
+             case '\'':
+             case '<':
+                 t.error(this);
+                 t.tagPending.newAttribute();
+                 t.tagPending.appendAttributeName(c);
+                 t.transition(AttributeName);
+                 break;
+             default: // A-Z, anything else
+                 t.tagPending.newAttribute();
+                 r.unconsume();
+                 t.transition(AttributeName);
+         }
+     }
+ },
+ BeforeAttributeValue {
+     // after '=': decide between double-quoted, single-quoted and unquoted value
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '\t': case '\n': case '\r': case '\f': case ' ':
+                 // ignore
+                 break;
+             case '"':
+                 t.transition(AttributeValue_doubleQuoted);
+                 break;
+             case '\'':
+                 t.transition(AttributeValue_singleQuoted);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.tagPending.appendAttributeValue(replacementChar);
+                 t.transition(AttributeValue_unquoted);
+                 break;
+             case eof:
+             case '>':
+                 if (next == eof) { t.eofError(this); } else { t.error(this); }
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case '<': case '=': case '`':
+                 t.error(this);
+                 t.tagPending.appendAttributeValue(next);
+                 t.transition(AttributeValue_unquoted);
+                 break;
+             case '&':
+             default:
+                 // hand the character back so the unquoted-value state processes it
+                 r.unconsume();
+                 t.transition(AttributeValue_unquoted);
+         }
+     }
+ },
+ AttributeValue_doubleQuoted {
+     // inside a "..." attribute value
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final String chunk = r.consumeToAny(attributeDoubleValueCharsSorted);
+         if (chunk.length() > 0) {
+             t.tagPending.appendAttributeValue(chunk);
+         } else {
+             t.tagPending.setEmptyAttributeValue();
+         }
+
+         final char next = r.consume();
+         if (next == '"') {
+             t.transition(AfterAttributeValue_quoted);
+         } else if (next == '&') {
+             final int[] ref = t.consumeCharacterReference('"', true);
+             if (ref != null) {
+                 t.tagPending.appendAttributeValue(ref);
+             } else {
+                 t.tagPending.appendAttributeValue('&');
+             }
+         } else if (next == nullChar) {
+             t.error(this);
+             t.tagPending.appendAttributeValue(replacementChar);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.transition(Data);
+         } else {
+             // hit end of buffer in first read, still in attribute
+             t.tagPending.appendAttributeValue(next);
+         }
+     }
+ },
+ AttributeValue_singleQuoted {
+     // inside a '...' attribute value
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final String chunk = r.consumeToAny(attributeSingleValueCharsSorted);
+         if (chunk.length() > 0) {
+             t.tagPending.appendAttributeValue(chunk);
+         } else {
+             t.tagPending.setEmptyAttributeValue();
+         }
+
+         final char next = r.consume();
+         if (next == '\'') {
+             t.transition(AfterAttributeValue_quoted);
+         } else if (next == '&') {
+             final int[] ref = t.consumeCharacterReference('\'', true);
+             if (ref != null) {
+                 t.tagPending.appendAttributeValue(ref);
+             } else {
+                 t.tagPending.appendAttributeValue('&');
+             }
+         } else if (next == nullChar) {
+             t.error(this);
+             t.tagPending.appendAttributeValue(replacementChar);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.transition(Data);
+         } else {
+             // hit end of buffer in first read, still in attribute
+             t.tagPending.appendAttributeValue(next);
+         }
+     }
+ },
+ AttributeValue_unquoted {
+     // inside an unquoted attribute value
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final String chunk = r.consumeToAnySorted(attributeValueUnquoted);
+         if (chunk.length() > 0) {
+             t.tagPending.appendAttributeValue(chunk);
+         }
+
+         final char next = r.consume();
+         switch (next) {
+             case '\t': case '\n': case '\r': case '\f': case ' ':
+                 t.transition(BeforeAttributeName);
+                 break;
+             case '&': {
+                 final int[] ref = t.consumeCharacterReference('>', true);
+                 if (ref != null) {
+                     t.tagPending.appendAttributeValue(ref);
+                 } else {
+                     t.tagPending.appendAttributeValue('&');
+                 }
+                 break;
+             }
+             case '>':
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.tagPending.appendAttributeValue(replacementChar);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.transition(Data);
+                 break;
+             case '"': case '\'': case '<': case '=': case '`':
+                 // reported as an error, but still taken as part of the value
+                 t.error(this);
+                 t.tagPending.appendAttributeValue(next);
+                 break;
+             default: // hit end of buffer in first read, still in attribute
+                 t.tagPending.appendAttributeValue(next);
+         }
+     }
+ },
+ // CharacterReferenceInAttributeValue state handled inline
+ AfterAttributeValue_quoted {
+     // just after the closing quote of an attribute value
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         if (next == '\t' || next == '\n' || next == '\r' || next == '\f' || next == ' ') {
+             t.transition(BeforeAttributeName);
+         } else if (next == '/') {
+             t.transition(SelfClosingStartTag);
+         } else if (next == '>') {
+             t.emitTagPending();
+             t.transition(Data);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.transition(Data);
+         } else {
+             // anything else: report it, then let BeforeAttributeName re-read the character
+             t.error(this);
+             r.unconsume();
+             t.transition(BeforeAttributeName);
+         }
+     }
+ },
+ SelfClosingStartTag {
+     // after '/' inside a tag: only '>' completes a self-closing tag
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         if (next == '>') {
+             t.tagPending.selfClosing = true;
+             t.emitTagPending();
+             t.transition(Data);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.transition(Data);
+         } else {
+             t.error(this);
+             r.unconsume();
+             t.transition(BeforeAttributeName);
+         }
+     }
+ },
+ BogusComment {
+     // emits everything up to the next '>' as a comment flagged as bogus
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         // todo: handle bogus comment starting from eof. when does that trigger?
+         // rewind to capture character that lead us here
+         r.unconsume();
+         final Token.Comment bogusComment = new Token.Comment();
+         bogusComment.bogus = true;
+         // todo: replace nullChar with replaceChar
+         bogusComment.data.append(r.consumeTo('>'));
+         t.emit(bogusComment);
+         t.advanceTransition(Data);
+     }
+ },
+ MarkupDeclarationOpen {
+     // after "<!": comment, doctype, CDATA section, or otherwise a bogus comment
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchConsume("--")) {
+             t.createCommentPending();
+             t.transition(CommentStart);
+             return;
+         }
+         if (r.matchConsumeIgnoreCase("DOCTYPE")) {
+             t.transition(Doctype);
+             return;
+         }
+         // todo: should actually check current namespace, and only non-html allows cdata. until namespace
+         // is implemented properly, keep handling as cdata
+         if (r.matchConsume("[CDATA[")) {
+             t.createTempBuffer();
+             t.transition(CdataSection);
+             return;
+         }
+         t.error(this);
+         t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
+     }
+ },
+ CommentStart {
+     // just after "<!--"
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '-':
+                 t.transition(CommentStartDash);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.commentPending.data.append(replacementChar);
+                 t.transition(Comment);
+                 break;
+             case '>':
+             case eof:
+                 if (next == eof) { t.eofError(this); } else { t.error(this); }
+                 t.emitCommentPending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.commentPending.data.append(next);
+                 t.transition(Comment);
+         }
+     }
+ },
+ CommentStartDash {
+     // after "<!--" followed by one '-'
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '-':
+                 // a second dash means the comment may be ending: "<!---->" is an empty comment.
+                 // (Looping back to this state made "<!---->" report an error and swallowed
+                 // additional dashes.)
+                 t.transition(CommentEnd);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 // the dash that brought us here is comment data
+                 t.commentPending.data.append('-').append(replacementChar);
+                 t.transition(Comment);
+                 break;
+             case '>':
+                 t.error(this);
+                 t.emitCommentPending();
+                 t.transition(Data);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.emitCommentPending();
+                 t.transition(Data);
+                 break;
+             default:
+                 // the dash that brought us here is comment data: "<!---a-->" has data "-a"
+                 t.commentPending.data.append('-').append(c);
+                 t.transition(Comment);
+         }
+     }
+ },
+ Comment {
+     // inside comment data
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.current();
+         if (next == '-') {
+             t.advanceTransition(CommentEndDash);
+         } else if (next == nullChar) {
+             t.error(this);
+             r.advance();
+             t.commentPending.data.append(replacementChar);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.emitCommentPending();
+             t.transition(Data);
+         } else {
+             t.commentPending.data.append(r.consumeToAny('-', nullChar));
+         }
+     }
+ },
+ CommentEndDash {
+     // one '-' seen inside comment data
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         if (next == '-') {
+             t.transition(CommentEnd);
+         } else if (next == nullChar) {
+             t.error(this);
+             t.commentPending.data.append('-').append(replacementChar);
+             t.transition(Comment);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.emitCommentPending();
+             t.transition(Data);
+         } else {
+             // the dash was ordinary data after all
+             t.commentPending.data.append('-').append(next);
+             t.transition(Comment);
+         }
+     }
+ },
+ CommentEnd {
+     // "--" seen inside a comment; '>' closes it
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '>':
+                 t.emitCommentPending();
+                 t.transition(Data);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.commentPending.data.append("--").append(replacementChar);
+                 t.transition(Comment);
+                 break;
+             case '!':
+                 t.error(this);
+                 t.transition(CommentEndBang);
+                 break;
+             case '-':
+                 t.error(this);
+                 t.commentPending.data.append('-');
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.emitCommentPending();
+                 t.transition(Data);
+                 break;
+             default:
+                 // the two dashes were ordinary data after all
+                 t.error(this);
+                 t.commentPending.data.append("--").append(next);
+                 t.transition(Comment);
+         }
+     }
+ },
+ CommentEndBang {
+     // "--!" seen inside a comment
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         if (next == '-') {
+             t.commentPending.data.append("--!");
+             t.transition(CommentEndDash);
+         } else if (next == '>') {
+             t.emitCommentPending();
+             t.transition(Data);
+         } else if (next == nullChar) {
+             t.error(this);
+             t.commentPending.data.append("--!").append(replacementChar);
+             t.transition(Comment);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.emitCommentPending();
+             t.transition(Data);
+         } else {
+             t.commentPending.data.append("--!").append(next);
+             t.transition(Comment);
+         }
+     }
+ },
+ Doctype {
+     // just after "<!DOCTYPE"
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         if (next == '\t' || next == '\n' || next == '\r' || next == '\f' || next == ' ') {
+             t.transition(BeforeDoctypeName);
+         } else if (next == eof || next == '>') {
+             // EOF reports the EOF error first and then shares the handling of '>'
+             if (next == eof) {
+                 t.eofError(this);
+             }
+             t.error(this);
+             t.createDoctypePending();
+             t.doctypePending.forceQuirks = true;
+             t.emitDoctypePending();
+             t.transition(Data);
+         } else {
+             t.error(this);
+             t.transition(BeforeDoctypeName);
+         }
+     }
+ },
+ BeforeDoctypeName {
+     // skips whitespace before the doctype name and creates the pending doctype token
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             t.createDoctypePending();
+             t.transition(DoctypeName);
+             return;
+         }
+         final char next = r.consume();
+         if (next == '\t' || next == '\n' || next == '\r' || next == '\f' || next == ' ') {
+             return; // ignore whitespace
+         }
+         if (next == eof) {
+             t.eofError(this);
+             t.createDoctypePending();
+             t.doctypePending.forceQuirks = true;
+             t.emitDoctypePending();
+             t.transition(Data);
+             return;
+         }
+         if (next == nullChar) {
+             t.error(this);
+             t.createDoctypePending();
+             t.doctypePending.name.append(replacementChar);
+         } else {
+             t.createDoctypePending();
+             t.doctypePending.name.append(next);
+         }
+         t.transition(DoctypeName);
+     }
+ },
+ DoctypeName {
+     // accumulates the doctype name
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.matchesLetter()) {
+             t.doctypePending.name.append(r.consumeLetterSequence());
+             return;
+         }
+         final char next = r.consume();
+         if (next == '>') {
+             t.emitDoctypePending();
+             t.transition(Data);
+         } else if (next == '\t' || next == '\n' || next == '\r' || next == '\f' || next == ' ') {
+             t.transition(AfterDoctypeName);
+         } else if (next == nullChar) {
+             t.error(this);
+             t.doctypePending.name.append(replacementChar);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.doctypePending.forceQuirks = true;
+             t.emitDoctypePending();
+             t.transition(Data);
+         } else {
+             t.doctypePending.name.append(next);
+         }
+     }
+ },
+ AfterDoctypeName {
+     // after the doctype name: expects '>', PUBLIC or SYSTEM
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         if (r.isEmpty()) {
+             t.eofError(this);
+             t.doctypePending.forceQuirks = true;
+             t.emitDoctypePending();
+             t.transition(Data);
+             return;
+         }
+         if (r.matchesAny('\t', '\n', '\r', '\f', ' ')) {
+             r.advance(); // ignore whitespace
+             return;
+         }
+         if (r.matches('>')) {
+             t.emitDoctypePending();
+             t.advanceTransition(Data);
+             return;
+         }
+         if (r.matchConsumeIgnoreCase(DocumentType.PUBLIC_KEY)) {
+             t.doctypePending.pubSysKey = DocumentType.PUBLIC_KEY;
+             t.transition(AfterDoctypePublicKeyword);
+             return;
+         }
+         if (r.matchConsumeIgnoreCase(DocumentType.SYSTEM_KEY)) {
+             t.doctypePending.pubSysKey = DocumentType.SYSTEM_KEY;
+             t.transition(AfterDoctypeSystemKeyword);
+             return;
+         }
+         t.error(this);
+         t.doctypePending.forceQuirks = true;
+         t.advanceTransition(BogusDoctype);
+     }
+ },
+ AfterDoctypePublicKeyword {
+     // just after the PUBLIC keyword
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '\t': case '\n': case '\r': case '\f': case ' ':
+                 t.transition(BeforeDoctypePublicIdentifier);
+                 break;
+             case '"':
+                 // quote directly after the keyword: reported, then read the identifier
+                 t.error(this);
+                 t.transition(DoctypePublicIdentifier_doubleQuoted);
+                 break;
+             case '\'':
+                 t.error(this);
+                 t.transition(DoctypePublicIdentifier_singleQuoted);
+                 break;
+             case '>':
+             case eof:
+                 if (next == eof) { t.eofError(this); } else { t.error(this); }
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.transition(BogusDoctype);
+         }
+     }
+ },
+ BeforeDoctypePublicIdentifier {
+     // whitespace seen after PUBLIC; waiting for the opening quote of the identifier
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '\t': case '\n': case '\r': case '\f': case ' ':
+                 break;
+             case '"':
+                 t.transition(DoctypePublicIdentifier_doubleQuoted);
+                 break;
+             case '\'':
+                 t.transition(DoctypePublicIdentifier_singleQuoted);
+                 break;
+             case '>':
+             case eof:
+                 if (next == eof) { t.eofError(this); } else { t.error(this); }
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.transition(BogusDoctype);
+         }
+     }
+ },
+ DoctypePublicIdentifier_doubleQuoted {
+     // inside a "..." public identifier
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '"':
+                 t.transition(AfterDoctypePublicIdentifier);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.doctypePending.publicIdentifier.append(replacementChar);
+                 break;
+             case '>':
+             case eof:
+                 if (next == eof) { t.eofError(this); } else { t.error(this); }
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.doctypePending.publicIdentifier.append(next);
+         }
+     }
+ },
+ DoctypePublicIdentifier_singleQuoted {
+     // inside a '...' public identifier
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '\'':
+                 t.transition(AfterDoctypePublicIdentifier);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.doctypePending.publicIdentifier.append(replacementChar);
+                 break;
+             case '>':
+             case eof:
+                 if (next == eof) { t.eofError(this); } else { t.error(this); }
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.doctypePending.publicIdentifier.append(next);
+         }
+     }
+ },
+ AfterDoctypePublicIdentifier {
+     // just after the closing quote of the public identifier
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '\t': case '\n': case '\r': case '\f': case ' ':
+                 t.transition(BetweenDoctypePublicAndSystemIdentifiers);
+                 break;
+             case '>':
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             case '"':
+                 // system identifier starts without separating whitespace: reported, then read
+                 t.error(this);
+                 t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                 break;
+             case '\'':
+                 t.error(this);
+                 t.transition(DoctypeSystemIdentifier_singleQuoted);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.transition(BogusDoctype);
+         }
+     }
+ },
+ BetweenDoctypePublicAndSystemIdentifiers {
+     // whitespace seen after the public identifier; a system identifier may follow
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '\t': case '\n': case '\r': case '\f': case ' ':
+                 break;
+             case '>':
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             case '"':
+                 t.error(this);
+                 t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                 break;
+             case '\'':
+                 t.error(this);
+                 t.transition(DoctypeSystemIdentifier_singleQuoted);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.transition(BogusDoctype);
+         }
+     }
+ },
+ AfterDoctypeSystemKeyword {
+     // just after the SYSTEM keyword
+     void read(Tokeniser t, CharacterReader r) {
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\r':
+             case '\f':
+             case ' ':
+                 t.transition(BeforeDoctypeSystemIdentifier);
+                 break;
+             case '>':
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             case '"':
+                 t.error(this);
+                 // system id empty
+                 t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                 break;
+             case '\'':
+                 t.error(this);
+                 // system id empty
+                 t.transition(DoctypeSystemIdentifier_singleQuoted);
+                 break;
+             case eof:
+                 t.eofError(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 // skip the rest of the malformed doctype; BogusDoctype emits it once at '>' or EOF.
+                 // (Emitting here without leaving this state let the same doctype be emitted again
+                 // on following characters; AfterDoctypePublicKeyword already transitions like this.)
+                 t.transition(BogusDoctype);
+         }
+     }
+ },
+ BeforeDoctypeSystemIdentifier {
+     // whitespace seen after SYSTEM; waiting for the opening quote of the identifier
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '\t': case '\n': case '\r': case '\f': case ' ':
+                 break;
+             case '"':
+                 t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                 break;
+             case '\'':
+                 t.transition(DoctypeSystemIdentifier_singleQuoted);
+                 break;
+             case '>':
+             case eof:
+                 if (next == eof) { t.eofError(this); } else { t.error(this); }
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.error(this);
+                 t.doctypePending.forceQuirks = true;
+                 t.transition(BogusDoctype);
+         }
+     }
+ },
+ DoctypeSystemIdentifier_doubleQuoted {
+     // inside a "..." system identifier
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '"':
+                 t.transition(AfterDoctypeSystemIdentifier);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.doctypePending.systemIdentifier.append(replacementChar);
+                 break;
+             case '>':
+             case eof:
+                 if (next == eof) { t.eofError(this); } else { t.error(this); }
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.doctypePending.systemIdentifier.append(next);
+         }
+     }
+ },
+ DoctypeSystemIdentifier_singleQuoted {
+     // inside a '...' system identifier
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         switch (next) {
+             case '\'':
+                 t.transition(AfterDoctypeSystemIdentifier);
+                 break;
+             case nullChar:
+                 t.error(this);
+                 t.doctypePending.systemIdentifier.append(replacementChar);
+                 break;
+             case '>':
+             case eof:
+                 if (next == eof) { t.eofError(this); } else { t.error(this); }
+                 t.doctypePending.forceQuirks = true;
+                 t.emitDoctypePending();
+                 t.transition(Data);
+                 break;
+             default:
+                 t.doctypePending.systemIdentifier.append(next);
+         }
+     }
+ },
+ AfterDoctypeSystemIdentifier {
+     // just after the closing quote of the system identifier
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         if (next == '\t' || next == '\n' || next == '\r' || next == '\f' || next == ' ') {
+             return; // ignore whitespace
+         }
+         if (next == '>') {
+             t.emitDoctypePending();
+             t.transition(Data);
+         } else if (next == eof) {
+             t.eofError(this);
+             t.doctypePending.forceQuirks = true;
+             t.emitDoctypePending();
+             t.transition(Data);
+         } else {
+             t.error(this);
+             t.transition(BogusDoctype);
+             // NOT force quirks
+         }
+     }
+ },
+ BogusDoctype {
+     // discards characters until '>' or EOF, then emits the pending doctype
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         final char next = r.consume();
+         if (next == '>' || next == eof) {
+             t.emitDoctypePending();
+             t.transition(Data);
+         }
+         // any other char is ignored
+     }
+ },
+ CdataSection {
+     // buffers text up to "]]>" (or end of input) and emits it as one CData token
+     @Override
+     void read(Tokeniser t, CharacterReader r) {
+         t.dataBuffer.append(r.consumeTo("]]>"));
+         final boolean sectionDone = r.matchConsume("]]>") || r.isEmpty();
+         if (sectionDone) {
+             t.emit(new Token.CData(t.dataBuffer.toString()));
+             t.transition(Data);
+         }
+         // otherwise, buffer underrun, stay in data section
+     }
+ };
+
+
+ /**
+  * Performs one step of this state: inspects and/or consumes input from the reader and
+  * drives the tokeniser (emitting tokens, recording errors, changing state).
+  * @param t the tokeniser being driven
+  * @param r the input being tokenised
+  */
+ abstract void read(Tokeniser t, CharacterReader r);
+
+ static final char nullChar = '\u0000';
+ // char searches. must be sorted, used in inSorted. MUST update TokeniserStateTest if more arrays are added.
+ // characters that end a run of plain text inside a single-quoted attribute value
+ static final char[] attributeSingleValueCharsSorted = new char[]{nullChar, '&', '\''};
+ // characters that end a run of plain text inside a double-quoted attribute value
+ static final char[] attributeDoubleValueCharsSorted = new char[]{nullChar, '"', '&'};
+ // characters that end a run of attribute-name characters
+ static final char[] attributeNameCharsSorted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>'};
+ // characters that end a run of plain text inside an unquoted attribute value
+ static final char[] attributeValueUnquoted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '&', '\'', '<', '=', '>', '`'};
+
+ // local aliases so the states above can use these as switch case labels
+ private static final char replacementChar = Tokeniser.replacementChar;
+ private static final String replacementStr = String.valueOf(Tokeniser.replacementChar);
+ private static final char eof = CharacterReader.EOF;
+
+ /**
+  * Handles RawtextEndTagName, ScriptDataEndTagName, and ScriptDataEscapedEndTagName. Same body impl, just
+  * different else exit transitions.
+  * <p>
+  * Letters are appended to both the pending tag name and the data buffer. When the name turns out not to
+  * belong to an appropriate end tag, the buffered text is emitted as character data, prefixed with the
+  * "&lt;/" that was consumed on the way here, and the tokeniser returns to {@code elseTransition}.
+  * @param t the tokeniser being driven
+  * @param r the input being tokenised
+  * @param elseTransition state to return to when this is not an appropriate end tag
+  */
+ private static void handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition) {
+     if (r.matchesLetter()) {
+         String name = r.consumeLetterSequence();
+         t.tagPending.appendTagName(name);
+         t.dataBuffer.append(name);
+         return;
+     }
+
+     boolean needsExitTransition = false;
+     if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
+         char c = r.consume();
+         switch (c) {
+             case '\t':
+             case '\n':
+             case '\r':
+             case '\f':
+             case ' ':
+                 t.transition(BeforeAttributeName);
+                 break;
+             case '/':
+                 t.transition(SelfClosingStartTag);
+                 break;
+             case '>':
+                 t.emitTagPending();
+                 t.transition(Data);
+                 break;
+             default:
+                 // not a tag terminator: keep the character as part of the text to emit
+                 t.dataBuffer.append(c);
+                 needsExitTransition = true;
+         }
+     } else {
+         needsExitTransition = true;
+     }
+
+     if (needsExitTransition) {
+         t.emit("</" + t.dataBuffer.toString());
+         t.transition(elseTransition);
+     }
+ }
+
+ /**
+  * Shared reader for states that treat input as text until a '&lt;' is seen.
+  * @param t the tokeniser being driven
+  * @param r the input being tokenised
+  * @param current the calling state, used when reporting errors
+  * @param advance state to advance into when '&lt;' is the current character
+  */
+ private static void readData(Tokeniser t, CharacterReader r, TokeniserState current, TokeniserState advance) {
+     final char next = r.current();
+     if (next == '<') {
+         t.advanceTransition(advance);
+     } else if (next == nullChar) {
+         t.error(current);
+         r.advance();
+         t.emit(replacementChar);
+     } else if (next == eof) {
+         t.emit(new Token.EOF());
+     } else {
+         // todo - why hunt for null here? Just consumeTo'<'?
+         t.emit(r.consumeToAny('<', nullChar));
+     }
+ }
+
+ /**
+  * Consumes a character reference and emits it, or emits a literal '&amp;' when none was consumed,
+  * then switches to the given state.
+  * @param t the tokeniser being driven
+  * @param advance state to switch to afterwards
+  */
+ private static void readCharRef(Tokeniser t, TokeniserState advance) {
+     final int[] codepoints = t.consumeCharacterReference(null, false);
+     if (codepoints != null) {
+         t.emit(codepoints);
+     } else {
+         t.emit('&');
+     }
+     t.transition(advance);
+ }
+
+ /**
+  * Shared end-tag-open handling: a letter starts a pending end tag and moves to state {@code a};
+  * anything else means the consumed "&lt;/" was plain text, which is emitted before returning to {@code b}.
+  * @param t the tokeniser being driven
+  * @param r the input being tokenised
+  * @param a state to enter when an end tag name starts
+  * @param b state to return to otherwise
+  */
+ private static void readEndTag(Tokeniser t, CharacterReader r, TokeniserState a, TokeniserState b) {
+     if (r.matchesLetter()) {
+         t.createTagPending(false);
+         t.transition(a);
+     } else {
+         t.emit("</");
+         t.transition(b);
+     }
+ }
+
+ /**
+  * Shared handling for the double-escape start/end states: letters are buffered and emitted; on
+  * whitespace, '/' or '&gt;' the state becomes {@code primary} when the buffered name is exactly
+  * "script" and {@code fallback} otherwise. Any other character is put back and {@code fallback} is used.
+  * @param t the tokeniser being driven
+  * @param r the input being tokenised
+  * @param primary state chosen when the buffered name is "script"
+  * @param fallback state chosen in every other case
+  */
+ private static void handleDataDoubleEscapeTag(Tokeniser t, CharacterReader r, TokeniserState primary, TokeniserState fallback) {
+     if (r.matchesLetter()) {
+         final String letters = r.consumeLetterSequence();
+         t.dataBuffer.append(letters);
+         t.emit(letters);
+         return;
+     }
+
+     final char next = r.consume();
+     final boolean terminator = next == '\t' || next == '\n' || next == '\r' || next == '\f'
+             || next == ' ' || next == '/' || next == '>';
+     if (!terminator) {
+         r.unconsume();
+         t.transition(fallback);
+         return;
+     }
+     if (t.dataBuffer.toString().equals("script")) {
+         t.transition(primary);
+     } else {
+         t.transition(fallback);
+     }
+     t.emit(next);
+ }
+}
diff --git a/settings.gradle b/settings.gradle
index 29dc38f9..11a2f10c 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -1 +1,2 @@
-include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension', ':library-syntax'
+include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension',
+ ':library-syntax', ':html-parser-api', ':html-parser-impl'