From ff3bedc37eaac85276f4899c366cbe3710318a67 Mon Sep 17 00:00:00 2001 From: Dimitry Ivanov Date: Fri, 17 Aug 2018 12:53:36 +0300 Subject: [PATCH] Added 2 modules: html-parser-api and html-parser-impl --- html-parser-api/build.gradle | 31 + html-parser-api/src/main/AndroidManifest.xml | 1 + .../java/ru/noties/markwon/html/HtmlTag.java | 54 + .../markwon/html/MarkwonHtmlParser.java | 36 + .../markwon/html/MarkwonHtmlParserNoOp.java | 26 + html-parser-impl/build.gradle | 32 + html-parser-impl/src/main/AndroidManifest.xml | 1 + .../ru/noties/markwon/html/HtmlTagImpl.java | 117 ++ .../markwon/html/MarkwonHtmlParserImpl.java | 396 ++++ .../html/jsoup/UncheckedIOException.java | 13 + .../markwon/html/jsoup/helper/Normalizer.java | 18 + .../markwon/html/jsoup/helper/Validate.java | 112 ++ .../markwon/html/jsoup/nodes/Attribute.java | 202 ++ .../markwon/html/jsoup/nodes/Attributes.java | 444 +++++ .../html/jsoup/nodes/DocumentType.java | 104 + .../markwon/html/jsoup/nodes/Entities.java | 351 ++++ .../html/jsoup/nodes/EntitiesData.java | 11 + .../html/jsoup/parser/CharacterReader.java | 483 +++++ .../markwon/html/jsoup/parser/ParseError.java | 41 + .../html/jsoup/parser/ParseErrorList.java | 34 + .../markwon/html/jsoup/parser/Token.java | 398 ++++ .../markwon/html/jsoup/parser/Tokeniser.java | 295 +++ .../html/jsoup/parser/TokeniserState.java | 1737 +++++++++++++++++ settings.gradle | 3 +- 24 files changed, 4939 insertions(+), 1 deletion(-) create mode 100644 html-parser-api/build.gradle create mode 100644 html-parser-api/src/main/AndroidManifest.xml create mode 100644 html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java create mode 100644 html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParser.java create mode 100644 html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserNoOp.java create mode 100644 html-parser-impl/build.gradle create mode 100644 html-parser-impl/src/main/AndroidManifest.xml create mode 100644 
html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/UncheckedIOException.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Normalizer.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Validate.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attribute.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attributes.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/DocumentType.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Entities.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/EntitiesData.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/CharacterReader.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseError.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseErrorList.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Token.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Tokeniser.java create mode 100644 html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/TokeniserState.java diff --git a/html-parser-api/build.gradle b/html-parser-api/build.gradle new file mode 100644 index 00000000..53bc9eee --- /dev/null +++ b/html-parser-api/build.gradle @@ -0,0 +1,31 @@ +apply plugin: 'com.android.library' + +android { + + compileSdkVersion TARGET_SDK + buildToolsVersion BUILD_TOOLS + + defaultConfig { + minSdkVersion MIN_SDK + targetSdkVersion TARGET_SDK + versionCode 1 + 
versionName version + } +} + +dependencies { + api SUPPORT_ANNOTATIONS +} + +afterEvaluate { + generateReleaseBuildConfig.enabled = false +} + +// todo: remove `local` check after merge with latest version (1.1.1) +if (hasProperty('release')) { + if (hasProperty('local')) { + ext.RELEASE_REPOSITORY_URL = LOCAL_MAVEN_URL + ext.SNAPSHOT_REPOSITORY_URL = LOCAL_MAVEN_URL + } + apply from: 'https://raw.githubusercontent.com/noties/gradle-mvn-push/master/gradle-mvn-push-aar.gradle' +} diff --git a/html-parser-api/src/main/AndroidManifest.xml b/html-parser-api/src/main/AndroidManifest.xml new file mode 100644 index 00000000..6d886e0e --- /dev/null +++ b/html-parser-api/src/main/AndroidManifest.xml @@ -0,0 +1 @@ + diff --git a/html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java b/html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java new file mode 100644 index 00000000..896f0d40 --- /dev/null +++ b/html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java @@ -0,0 +1,54 @@ +package ru.noties.markwon.html; + +import android.support.annotation.NonNull; +import android.support.annotation.Nullable; + +import java.util.List; + +/** + * @see Inline + * @see Block + */ +public interface HtmlTag { + + /** + * @return normalized tag name (lower-case) + */ + @NonNull + String name(); + + /** + * @return index at which this tag starts + */ + int start(); + + /** + * @return index at which this tag ends + */ + int end(); + + /** + * Represents really inline HTML tags (unlike commonmark definitions) + */ + interface Inline extends HtmlTag { + } + + /** + * Represents HTML block tags. 
Please note that all tags that are not inline should be + * considered as block tags + */ + interface Block extends HtmlTag { + + /** + * @return parent {@link Block} or null if there is no parent (this block is at root level) + */ + @Nullable + Block parent(); + + /** + * @return list of children + */ + @NonNull + List<Block> children(); + } +} diff --git a/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParser.java b/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParser.java new file mode 100644 index 00000000..cc168b87 --- /dev/null +++ b/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParser.java @@ -0,0 +1,36 @@ +package ru.noties.markwon.html; + +import android.support.annotation.NonNull; + +import java.util.List; + +public abstract class MarkwonHtmlParser { + + @NonNull + public static MarkwonHtmlParser noOp() { + return new MarkwonHtmlParserNoOp(); + } + + public interface FlushAction<T> { + void apply(@NonNull List<T> tags); + } + + public abstract <T extends Appendable & CharSequence> void processFragment( + @NonNull T output, + @NonNull String htmlFragment); + + // clear all pending tags (if any) + // todo: we also can do this: if supplied value is -1 (for example) we ignore tags that are not closed + public abstract void flushInlineTags( + int documentLength, + @NonNull FlushAction<HtmlTag.Inline> action); + + // clear all pending blocks if any + // todo: we also can do this: if supplied value is -1 (for example) we ignore tags that are not closed + public abstract void flushBlockTags( + int documentLength, + @NonNull FlushAction<HtmlTag.Block> action); + + public abstract void reset(); + +} diff --git a/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserNoOp.java b/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserNoOp.java new file mode 100644 index 00000000..56926d12 --- /dev/null +++ b/html-parser-api/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserNoOp.java @@ -0,0 +1,26 @@ +package ru.noties.markwon.html; + +import 
android.support.annotation.NonNull; + +class MarkwonHtmlParserNoOp extends MarkwonHtmlParser { + + @Override + public void processFragment(@NonNull T output, @NonNull String htmlFragment) { + + } + + @Override + public void flushInlineTags(int documentLength, @NonNull FlushAction action) { + + } + + @Override + public void flushBlockTags(int documentLength, @NonNull FlushAction action) { + + } + + @Override + public void reset() { + + } +} diff --git a/html-parser-impl/build.gradle b/html-parser-impl/build.gradle new file mode 100644 index 00000000..f1c8860f --- /dev/null +++ b/html-parser-impl/build.gradle @@ -0,0 +1,32 @@ +apply plugin: 'com.android.library' + +android { + + compileSdkVersion TARGET_SDK + buildToolsVersion BUILD_TOOLS + + defaultConfig { + minSdkVersion MIN_SDK + targetSdkVersion TARGET_SDK + versionCode 1 + versionName version + } +} + +dependencies { + api SUPPORT_ANNOTATIONS + api project(':html-parser-api') +} + +afterEvaluate { + generateReleaseBuildConfig.enabled = false +} + +// todo: remove `local` check after merge with latest version (1.1.1) +if (hasProperty('release')) { + if (hasProperty('local')) { + ext.RELEASE_REPOSITORY_URL = LOCAL_MAVEN_URL + ext.SNAPSHOT_REPOSITORY_URL = LOCAL_MAVEN_URL + } + apply from: 'https://raw.githubusercontent.com/noties/gradle-mvn-push/master/gradle-mvn-push-aar.gradle' +} diff --git a/html-parser-impl/src/main/AndroidManifest.xml b/html-parser-impl/src/main/AndroidManifest.xml new file mode 100644 index 00000000..6d886e0e --- /dev/null +++ b/html-parser-impl/src/main/AndroidManifest.xml @@ -0,0 +1 @@ + diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java new file mode 100644 index 00000000..3f8083dd --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java @@ -0,0 +1,117 @@ +package ru.noties.markwon.html; + +import android.support.annotation.NonNull; +import 
android.support.annotation.Nullable; + +import java.util.Collections; +import java.util.List; + +abstract class HtmlTagImpl implements HtmlTag { + + static final int NO_VALUE = -1; + + final String name; + final int start; + int end = NO_VALUE; + + protected HtmlTagImpl(@NonNull String name, int start) { + this.name = name; + this.start = start; + } + + @NonNull + @Override + public String name() { + return name; + } + + @Override + public int start() { + return start; + } + + @Override + public int end() { + return end; + } + + boolean isClosed() { + return end > NO_VALUE; + } + + abstract void closeAt(int end); + + + static class InlineImpl extends HtmlTagImpl implements Inline { + + InlineImpl(@NonNull String name, int start) { + super(name, start); + } + + @Override + void closeAt(int end) { + if (!isClosed()) { + super.end = end; + } + } + } + + static class BlockImpl extends HtmlTagImpl implements Block { + + @NonNull + static BlockImpl root() { + //noinspection ConstantConditions + return new BlockImpl("", 0, null); + } + + @NonNull + static BlockImpl create(@NonNull String name, int start, @NonNull BlockImpl parent) { + return new BlockImpl(name, start, parent); + } + + final BlockImpl parent; + List children; + + @SuppressWarnings("NullableProblems") + BlockImpl(@NonNull String name, int start, @NonNull BlockImpl parent) { + super(name, start); + this.parent = parent; + } + + @Override + void closeAt(int end) { + if (!isClosed()) { + super.end = end; + if (children != null) { + for (BlockImpl child: children) { + child.closeAt(end); + } + children = Collections.unmodifiableList(children); + } else { + children = Collections.emptyList(); + } + } + } + + boolean isRoot() { + return parent == null; + } + + @Nullable + @Override + public Block parent() { + if (parent == null) { + throw new IllegalStateException("#parent() getter was called on the root node " + + "which should not be exposed outside internal usage"); + } + return parent; + } + + @NonNull + 
@Override + public List children() { + //noinspection unchecked + return (List) (List) children; + } + } +} diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java new file mode 100644 index 00000000..c2813775 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java @@ -0,0 +1,396 @@ +package ru.noties.markwon.html; + +import android.support.annotation.NonNull; +import android.support.annotation.Nullable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import ru.noties.markwon.html.HtmlTag.Block; +import ru.noties.markwon.html.HtmlTag.Inline; +import ru.noties.markwon.html.HtmlTagImpl.BlockImpl; +import ru.noties.markwon.html.HtmlTagImpl.InlineImpl; +import ru.noties.markwon.html.jsoup.parser.CharacterReader; +import ru.noties.markwon.html.jsoup.parser.ParseErrorList; +import ru.noties.markwon.html.jsoup.parser.Token; +import ru.noties.markwon.html.jsoup.parser.Tokeniser; + +public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { + + @NonNull + public static MarkwonHtmlParserImpl create() { + return new MarkwonHtmlParserImpl(); + } + + // https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements + private static final Set INLINE_TAGS; + + private static final Set VOID_TAGS; + + // these are the tags that are considered _block_ ones + // this parser will ensure that these blocks are started on a new line + // other tags that are NOT inline are considered as block tags, but won't have new line + // inserted before them + // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements + private static final Set BLOCK_TAGS; + + private static final String TAG_PARAGRAPH = "p"; + private static final String TAG_LIST_ITEM = "li"; + + // todo: 
make it configurable + private static final String IMG_REPLACEMENT = "\uFFFC"; + + static { + INLINE_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( + "a", "abbr", "acronym", + "b", "bdo", "big", "br", "button", + "cite", "code", + "dfn", + "em", + "i", "img", "input", + "kbd", + "label", + "map", + "object", + "q", + "samp", "script", "select", "small", "span", "strong", "sub", "sup", + "textarea", "time", "tt", + "var" + ))); + VOID_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( + "area", + "base", "br", + "col", + "embed", + "hr", + "img", "input", + "keygen", + "link", + "meta", + "param", + "source", + "track", + "wbr" + ))); + BLOCK_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( + "address", "article", "aside", + "blockquote", + "canvas", + "dd", "div", "dl", "dt", + "fieldset", "figcaption", "figure", "footer", "form", + "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", + "li", + "main", + "nav", "noscript", + "ol", "output", + "p", "pre", + "section", + "table", "tfoot", + "ul", + "video" + ))); + } + + private final List inlineTags = new ArrayList<>(0); + + private BlockImpl currentBlock = BlockImpl.root(); + + + @Override + public void processFragment( + @NonNull T output, + @NonNull String htmlFragment) { + + // todo: maybe there is a way to reuse tokeniser... 
+ final Tokeniser tokeniser = new Tokeniser(new CharacterReader(htmlFragment), ParseErrorList.noTracking()); + + while (true) { + + final Token token = tokeniser.read(); + final Token.TokenType tokenType = token.type; + + if (Token.TokenType.EOF == tokenType) { + break; + } + + switch (tokenType) { + + case StartTag: { + + final Token.StartTag startTag = (Token.StartTag) token; + + if (isInlineTag(startTag.normalName)) { + processInlineTagStart(output, startTag); + } else { + processBlockTagStart(output, startTag); + } + } + break; + + case EndTag: { + + final Token.EndTag endTag = (Token.EndTag) token; + + if (isInlineTag(endTag.normalName)) { + processInlineTagEnd(output, endTag); + } else { + processBlockTagEnd(output, endTag); + } + } + break; + + case Character: { + processCharacter(output, ((Token.Character) token)); + } + break; + } + + // do not forget to reset processed token (even if it's not processed) + token.reset(); + } + } + + @Override + public void flushInlineTags(int documentLength, @NonNull FlushAction action) { + if (inlineTags.size() > 0) { + for (InlineImpl inline : inlineTags) { + inline.closeAt(documentLength); + } + //noinspection unchecked + action.apply(Collections.unmodifiableList((List) inlineTags)); + inlineTags.clear(); + } + } + + @Override + public void flushBlockTags(int documentLength, @NonNull FlushAction action) { + + BlockImpl block = currentBlock; + while (!block.isRoot()) { + block = block.parent; + } + + block.closeAt(documentLength); + + final List children = block.children(); + if (children.size() > 0) { + action.apply(children); + } + + currentBlock = BlockImpl.root(); + } + + @Override + public void reset() { + inlineTags.clear(); + currentBlock = BlockImpl.root(); + } + + + protected void processInlineTagStart( + @NonNull T output, + @NonNull Token.StartTag startTag) { + + final String name = startTag.normalName; + + final InlineImpl inline = new InlineImpl(name, output.length()); + + if (isVoidTag(name) + || 
startTag.selfClosing) { + + // check if we have content to append as we must close this tag here + processVoidTag(output, startTag); + + inline.end = output.length(); + } + + // actually only check if there is content for void/self-closing tags + // if none -> ignore it + if (inline.start != inline.end) { + inlineTags.add(inline); + } + } + + protected void processInlineTagEnd( + @NonNull T output, + @NonNull Token.EndTag endTag) { + + // try to find it, if none found -> ignore + final InlineImpl openInlineTag = findOpenInlineTag(endTag.normalName); + if (openInlineTag != null) { + // close open inline tag + openInlineTag.end = output.length(); + } + } + + + protected void processBlockTagStart( + @NonNull T output, + @NonNull Token.StartTag startTag) { + + final String name = startTag.normalName; + + // block tags (all that are NOT inline -> blocks + // I think there is only one strong rule -> paragraph cannot contain anything + // except inline tags + // also, closing paragraph with non-closed inlines -> doesn't close inlines + // they are continued for _afterwards_ + + if (TAG_PARAGRAPH.equals(currentBlock.name)) { + // it must be closed here not matter what we are as here we _assume_ + // that it's a block tag + append(output, "\n"); + currentBlock.end = output.length(); + currentBlock = currentBlock.parent; + } else if (TAG_LIST_ITEM.equals(name) + && TAG_LIST_ITEM.equals(currentBlock.name)) { + // close previous list item if in the same parent + currentBlock.end = output.length(); + currentBlock = currentBlock.parent; + } + + if (isBlockTag(name)) { + ensureNewLine(output); + } + + final int start = output.length(); + + final BlockImpl block = BlockImpl.create(name, start, currentBlock); + + //noinspection ConstantConditions + appendBlockChild(block.parent, block); + + this.currentBlock = block; + } + + protected void processBlockTagEnd( + @NonNull T output, + @NonNull Token.EndTag endTag) { + + final String name = endTag.normalName; + + final BlockImpl block 
= findOpenBlockTag(endTag.normalName); + if (block != null) { + + if (TAG_PARAGRAPH.equals(name)) { + append(output, "\n"); + } + + block.closeAt(output.length()); + this.currentBlock = block.parent; + } + } + + protected void processVoidTag( + @NonNull T output, + @NonNull Token.StartTag startTag) { + + final String name = startTag.normalName; + + if ("br".equals(name)) { + append(output, "\n"); + } else if ("img".equals(name)) { + final String alt = startTag.attributes.getIgnoreCase("alt"); + if (alt == null + || alt.length() == 0) { + // no alt is provided + append(output, IMG_REPLACEMENT); + } else { + append(output, alt); + } + } + + // other tags are ignored + } + + protected void processCharacter( + @NonNull T output, + @NonNull Token.Character character) { + + // the thing here is: if it's a script tag that we are inside -> we must not treat this + // as the text to append... should we even care about this? how many people are + // going to include freaking script tags as html inline? 
+ // + // so tags are: BUTTON, INPUT, SELECT, SCRIPT, TEXTAREA + // + // actually we must decide it here: should we append freaking characters for these _bad_ + // tags or not, as later we won't be able to change it and/or allow modification (as + // all indexes will be affected with this) + + // for now: ignore the inline context + append(output, character.getData()); + } + + protected void appendBlockChild(@NonNull BlockImpl parent, @NonNull BlockImpl child) { + List children = parent.children; + if (children == null) { + children = new ArrayList<>(2); + parent.children = children; + } + children.add(child); + } + + @Nullable + protected InlineImpl findOpenInlineTag(@NonNull String name) { + + InlineImpl inline; + + for (int i = inlineTags.size() - 1; i > -1; i--) { + inline = inlineTags.get(i); + if (name.equals(inline.name) + && inline.end < 0) { + return inline; + } + } + + return null; + } + + @Nullable + protected BlockImpl findOpenBlockTag(@NonNull String name) { + + BlockImpl blockTag = currentBlock; + + while (blockTag != null + && !name.equals(blockTag.name)) { + blockTag = blockTag.parent; + } + + return blockTag; + } + + // name here must lower case + protected static boolean isInlineTag(@NonNull String name) { + return INLINE_TAGS.contains(name); + } + + protected static boolean isVoidTag(@NonNull String name) { + return VOID_TAGS.contains(name); + } + + protected static boolean isBlockTag(@NonNull String name) { + return BLOCK_TAGS.contains(name); + } + + protected static void append(@NonNull Appendable appendable, @NonNull CharSequence text) { + try { + appendable.append(text); + } catch (IOException e) { + // _must_ not happen + throw new RuntimeException(e); + } + } + + protected static void ensureNewLine(@NonNull T output) { + final int length = output.length(); + if (length > 0 + && '\n' != output.charAt(length - 1)) { + append(output, "\n"); + } + } +} diff --git 
a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/UncheckedIOException.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/UncheckedIOException.java new file mode 100644 index 00000000..9548bdf4 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/UncheckedIOException.java @@ -0,0 +1,13 @@ +package ru.noties.markwon.html.jsoup; + +import java.io.IOException; + +public class UncheckedIOException extends RuntimeException { + public UncheckedIOException(IOException cause) { + super(cause); + } + + public IOException ioException() { + return (IOException) getCause(); + } +} diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Normalizer.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Normalizer.java new file mode 100644 index 00000000..a0df7dd4 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Normalizer.java @@ -0,0 +1,18 @@ +package ru.noties.markwon.html.jsoup.helper; + +import java.util.Locale; + +/** + * Util methods for normalizing strings. Jsoup internal use only, please don't depend on this API. + */ +public final class Normalizer { + + public static String lowerCase(final String input) { + return input != null ? input.toLowerCase(Locale.ENGLISH) : ""; + } + + public static String normalize(final String input) { + return lowerCase(input).trim(); + } +} + diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Validate.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Validate.java new file mode 100644 index 00000000..0d00249b --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/helper/Validate.java @@ -0,0 +1,112 @@ +package ru.noties.markwon.html.jsoup.helper; + +/** + * Simple validation methods. 
Designed for jsoup internal use + */ +public final class Validate { + + private Validate() {} + + /** + * Validates that the object is not null + * @param obj object to test + */ + public static void notNull(Object obj) { + if (obj == null) + throw new IllegalArgumentException("Object must not be null"); + } + + /** + * Validates that the object is not null + * @param obj object to test + * @param msg message to output if validation fails + */ + public static void notNull(Object obj, String msg) { + if (obj == null) + throw new IllegalArgumentException(msg); + } + + /** + * Validates that the value is true + * @param val object to test + */ + public static void isTrue(boolean val) { + if (!val) + throw new IllegalArgumentException("Must be true"); + } + + /** + * Validates that the value is true + * @param val object to test + * @param msg message to output if validation fails + */ + public static void isTrue(boolean val, String msg) { + if (!val) + throw new IllegalArgumentException(msg); + } + + /** + * Validates that the value is false + * @param val object to test + */ + public static void isFalse(boolean val) { + if (val) + throw new IllegalArgumentException("Must be false"); + } + + /** + * Validates that the value is false + * @param val object to test + * @param msg message to output if validation fails + */ + public static void isFalse(boolean val, String msg) { + if (val) + throw new IllegalArgumentException(msg); + } + + /** + * Validates that the array contains no null elements + * @param objects the array to test + */ + public static void noNullElements(Object[] objects) { + noNullElements(objects, "Array must not contain any null objects"); + } + + /** + * Validates that the array contains no null elements + * @param objects the array to test + * @param msg message to output if validation fails + */ + public static void noNullElements(Object[] objects, String msg) { + for (Object obj : objects) + if (obj == null) + throw new 
IllegalArgumentException(msg); + } + + /** + * Validates that the string is not empty + * @param string the string to test + */ + public static void notEmpty(String string) { + if (string == null || string.length() == 0) + throw new IllegalArgumentException("String must not be empty"); + } + + /** + * Validates that the string is not empty + * @param string the string to test + * @param msg message to output if validation fails + */ + public static void notEmpty(String string, String msg) { + if (string == null || string.length() == 0) + throw new IllegalArgumentException(msg); + } + + /** + Cause a failure. + @param msg message to output. + */ + public static void fail(String msg) { + throw new IllegalArgumentException(msg); + } +} diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attribute.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attribute.java new file mode 100644 index 00000000..fea596e2 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attribute.java @@ -0,0 +1,202 @@ +package ru.noties.markwon.html.jsoup.nodes; + +import java.util.Map; + +import ru.noties.markwon.html.jsoup.helper.Validate; + +/** + A single key + value attribute. (Only used for presentation.) 
+ */ +public class Attribute implements Map.Entry<String, String>, Cloneable { +// private static final String[] booleanAttributes = { +// "allowfullscreen", "async", "autofocus", "checked", "compact", "declare", "default", "defer", "disabled", +// "formnovalidate", "hidden", "inert", "ismap", "itemscope", "multiple", "muted", "nohref", "noresize", +// "noshade", "novalidate", "nowrap", "open", "readonly", "required", "reversed", "seamless", "selected", +// "sortable", "truespeed", "typemustmatch" +// }; + + private String key; + private String val; + Attributes parent; // used to update the holding Attributes when the key / value is changed via this interface + + /** + * Create a new attribute from unencoded (raw) key and value. + * @param key attribute key; case is preserved. + * @param value attribute value + */ + public Attribute(String key, String value) { + this(key, value, null); + } + + /** + * Create a new attribute from unencoded (raw) key and value. + * @param key attribute key; case is preserved; trimmed before use and must not be empty after trimming. + * @param val attribute value + * @param parent the containing Attributes (this Attribute is not automatically added to said Attributes) + */ + public Attribute(String key, String val, Attributes parent) { + Validate.notNull(key); + this.key = key.trim(); + Validate.notEmpty(this.key); // validate the trimmed key: trimming could potentially make it empty + this.val = val; + this.parent = parent; + } + + /** + Get the attribute key. + @return the attribute key + */ + public String getKey() { + return key; + } + + /** + Set the attribute key; case is preserved. + @param key the new key; must not be null + */ + public void setKey(String key) { + Validate.notNull(key); + key = key.trim(); + Validate.notEmpty(key); // trimming could potentially make empty, so validate here + if (parent != null) { + int i = parent.indexOfKey(this.key); + if (i != Attributes.NotFound) + parent.keys[i] = key; + } + this.key = key; + } + + /** + Get the attribute value. 
+ @return the attribute value + */ + public String getValue() { + return val; + } + + /** + Set the attribute value and return the previous one; safe to call when this attribute has no parent. + @param val the new attribute value; must not be null + */ + public String setValue(String val) { + String oldVal = parent != null ? parent.get(this.key) : this.val; // parent is null for detached attributes + if (parent != null) { + int i = parent.indexOfKey(this.key); + if (i != Attributes.NotFound) + parent.vals[i] = val; + } + this.val = val; + return oldVal; + } + +// /** +// Get the HTML representation of this attribute; e.g. {@code href="index.html"}. +// @return HTML +// */ +// public String html() { +// StringBuilder accum = new StringBuilder(); +// +// try { +// html(accum, (new Document("")).outputSettings()); +// } catch(IOException exception) { +// throw new SerializationException(exception); +// } +// return accum.toString(); +// } +// +// protected static void html(String key, String val, Appendable accum, Document.OutputSettings out) throws IOException { +// accum.append(key); +// if (!shouldCollapseAttribute(key, val, out)) { +// accum.append("=\""); +// Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false); +// accum.append('"'); +// } +// } +// +// protected void html(Appendable accum, Document.OutputSettings out) throws IOException { +// html(key, val, accum, out); +// } + +// /** +// Get the string representation of this attribute, implemented as {@link #html()}. +// @return string +// */ +// @Override +// public String toString() { +// return html(); +// } + +// /** +// * Create a new Attribute from an unencoded key and a HTML attribute encoded value. +// * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars. 
+// * @param encodedValue HTML attribute encoded value +// * @return attribute +// */ +// public static Attribute createFromEncoded(String unencodedKey, String encodedValue) { +// String value = Entities.unescape(encodedValue, true); +// return new Attribute(unencodedKey, value, null); // parent will get set when Put +// } + + protected boolean isDataAttribute() { + return isDataAttribute(key); + } + + protected static boolean isDataAttribute(String key) { + return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length(); + } + +// /** +// * Collapsible if it's a boolean attribute and value is empty or same as name +// * +// * @param out output settings +// * @return Returns whether collapsible or not +// */ +// protected final boolean shouldCollapseAttribute(Document.OutputSettings out) { +// return shouldCollapseAttribute(key, val, out); +// } + +// protected static boolean shouldCollapseAttribute(final String key, final String val, final Document.OutputSettings out) { +// return ( +// out.syntax() == Document.OutputSettings.Syntax.html && +// (val == null || ("".equals(val) || val.equalsIgnoreCase(key)) && Attribute.isBooleanAttribute(key))); +// } + +// /** +// * @deprecated +// */ +// protected boolean isBooleanAttribute() { +// return Arrays.binarySearch(booleanAttributes, key) >= 0 || val == null; +// } +// +// /** +// * Checks if this attribute name is defined as a boolean attribute in HTML5 +// */ +// protected static boolean isBooleanAttribute(final String key) { +// return Arrays.binarySearch(booleanAttributes, key) >= 0; +// } + + @Override + public boolean equals(Object o) { // note parent not considered + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Attribute attribute = (Attribute) o; + if (key != null ? !key.equals(attribute.key) : attribute.key != null) return false; + return val != null ? 
val.equals(attribute.val) : attribute.val == null; + } + + @Override + public int hashCode() { // note parent not considered + int result = key != null ? key.hashCode() : 0; + result = 31 * result + (val != null ? val.hashCode() : 0); + return result; + } + + @Override + public Attribute clone() { + try { + return (Attribute) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } +} diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attributes.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attributes.java new file mode 100644 index 00000000..f00ecfe1 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Attributes.java @@ -0,0 +1,444 @@ +package ru.noties.markwon.html.jsoup.nodes; + +import java.util.AbstractMap; +import java.util.AbstractSet; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import ru.noties.markwon.html.jsoup.helper.Validate; + +import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase; + +/** + * The attributes of an Element. + *

+ * Attributes are treated as a map: there can be only one value associated with an attribute key/name. + *

+ *

+ * Attribute name and value comparisons are generally case sensitive. By default for HTML, attribute names are + * normalized to lower-case on parsing. That means you should use lower-case strings when referring to attributes by + * name. + *

+ *
+ * @author Jonathan Hedley, jonathan@hedley.net
+ */
+public class Attributes implements Iterable<Attribute>, Cloneable {
+    protected static final String dataPrefix = "data-";
+    private static final int InitialCapacity = 4; // todo - analyze Alexa 1MM sites, determine best setting
+
+    // manages the key/val arrays
+    private static final int GrowthFactor = 2;
+    private static final String[] Empty = {};
+    static final int NotFound = -1;
+    private static final String EmptyString = "";
+
+    private int size = 0; // number of slots used (not capacity, which is keys.length)
+    String[] keys = Empty;
+    String[] vals = Empty;
+
+    // check there's room for more; grows both arrays together so keys[i] / vals[i] stay paired
+    private void checkCapacity(int minNewSize) {
+        Validate.isTrue(minNewSize >= size);
+        int curSize = keys.length;
+        if (curSize >= minNewSize)
+            return;
+
+        int newSize = curSize >= InitialCapacity ? size * GrowthFactor : InitialCapacity;
+        if (minNewSize > newSize)
+            newSize = minNewSize;
+
+        keys = copyOf(keys, newSize);
+        vals = copyOf(vals, newSize);
+    }
+
+    // simple implementation of Arrays.copy, for support of Android API 8.
+    // copies min(orig.length, size) elements; any remaining slots of the result are null
+    private static String[] copyOf(String[] orig, int size) {
+        final String[] copy = new String[size];
+        System.arraycopy(orig, 0, copy, 0,
+                Math.min(orig.length, size));
+        return copy;
+    }
+
+    // index of the first used slot whose key equals the given key (case sensitive), or NotFound
+    int indexOfKey(String key) {
+        Validate.notNull(key);
+        for (int i = 0; i < size; i++) {
+            if (key.equals(keys[i]))
+                return i;
+        }
+        return NotFound;
+    }
+
+    // as indexOfKey, but compares keys ignoring case
+    private int indexOfKeyIgnoreCase(String key) {
+        Validate.notNull(key);
+        for (int i = 0; i < size; i++) {
+            if (key.equalsIgnoreCase(keys[i]))
+                return i;
+        }
+        return NotFound;
+    }
+
+    // we track boolean attributes as null in values - they're just keys. so returns empty for consumers
+    static String checkNotNull(String val) {
+        return val == null ? EmptyString : val;
+    }
+
+    /**
+     Get an attribute value by key.
+     @param key the (case-sensitive) attribute key
+     @return the attribute value if set; or empty string if not set (or a boolean attribute).
+     @see #hasKey(String)
+     */
+    public String get(String key) {
+        int i = indexOfKey(key);
+        return i == NotFound ? EmptyString : checkNotNull(vals[i]);
+    }
+
+    /**
+     * Get an attribute's value by case-insensitive key
+     * @param key the attribute name
+     * @return the first matching attribute value if set; or empty string if not set (or a boolean attribute).
+     */
+    public String getIgnoreCase(String key) {
+        int i = indexOfKeyIgnoreCase(key);
+        return i == NotFound ? EmptyString : checkNotNull(vals[i]);
+    }
+
+    // adds without checking if this key exists
+    private void add(String key, String value) {
+        checkCapacity(size + 1);
+        keys[size] = key;
+        vals[size] = value;
+        size++;
+    }
+
+    /**
+     * Set a new attribute, or replace an existing one by key.
+     * @param key case sensitive attribute key
+     * @param value attribute value
+     * @return these attributes, for chaining
+     */
+    public Attributes put(String key, String value) {
+        int i = indexOfKey(key);
+        if (i != NotFound)
+            vals[i] = value;
+        else
+            add(key, value);
+        return this;
+    }
+
+    // as put(String, String), but matches an existing key ignoring case and adopts the new key's casing
+    void putIgnoreCase(String key, String value) {
+        int i = indexOfKeyIgnoreCase(key);
+        if (i != NotFound) {
+            vals[i] = value;
+            if (!keys[i].equals(key)) // case changed, update
+                keys[i] = key;
+        }
+        else
+            add(key, value);
+    }
+
+    /**
+     * Set a new boolean attribute, remove attribute if value is false.
+     * @param key case insensitive attribute key
+     * @param value attribute value
+     * @return these attributes, for chaining
+     */
+    public Attributes put(String key, boolean value) {
+        // NOTE(review): setting matches the key ignoring case, but removal below is case sensitive,
+        // so put("Checked", false) leaves an existing "checked" in place - confirm whether that is intended
+        if (value)
+            putIgnoreCase(key, null);
+        else
+            remove(key);
+        return this;
+    }
+
+    /**
+     Set a new attribute, or replace an existing one by key.
+     @param attribute attribute with case sensitive key
+     @return these attributes, for chaining
+     */
+    public Attributes put(Attribute attribute) {
+        Validate.notNull(attribute);
+        put(attribute.getKey(), attribute.getValue());
+        attribute.parent = this;
+        return this;
+    }
+
+    // removes and shifts up
+    private void remove(int index) {
+        Validate.isFalse(index >= size);
+        int shifted = size - index - 1;
+        if (shifted > 0) {
+            System.arraycopy(keys, index + 1, keys, index, shifted);
+            System.arraycopy(vals, index + 1, vals, index, shifted);
+        }
+        size--;
+        keys[size] = null; // release hold
+        vals[size] = null;
+    }
+
+    /**
+     Remove an attribute by key. Case sensitive.
+     @param key attribute key to remove
+     */
+    public void remove(String key) {
+        int i = indexOfKey(key);
+        if (i != NotFound)
+            remove(i);
+    }
+
+    /**
+     Remove an attribute by key. Case insensitive.
+     @param key attribute key to remove
+     */
+    public void removeIgnoreCase(String key) {
+        int i = indexOfKeyIgnoreCase(key);
+        if (i != NotFound)
+            remove(i);
+    }
+
+    /**
+     Tests if these attributes contain an attribute with this key.
+     @param key case-sensitive key to check for
+     @return true if key exists, false otherwise
+     */
+    public boolean hasKey(String key) {
+        return indexOfKey(key) != NotFound;
+    }
+
+    /**
+     Tests if these attributes contain an attribute with this key.
+     @param key key to check for
+     @return true if key exists, false otherwise
+     */
+    public boolean hasKeyIgnoreCase(String key) {
+        return indexOfKeyIgnoreCase(key) != NotFound;
+    }
+
+    /**
+     Get the number of attributes in this set.
+     @return size
+     */
+    public int size() {
+        return size;
+    }
+
+    /**
+     Add all the attributes from the incoming set to this set.
+     @param incoming attributes to add to these attributes.
+     */
+    public void addAll(Attributes incoming) {
+        if (incoming.size() == 0)
+            return;
+        checkCapacity(size + incoming.size);
+
+        for (Attribute attr : incoming) {
+            // todo - should this be case insensitive?
+            put(attr);
+        }
+
+    }
+
+    /**
+     Iterate the attributes in slot order. Each call to next() builds a new Attribute for the current
+     key/value pair with these attributes as its parent; remove() deletes the pair most recently returned.
+     @return an iterator over the attributes
+     */
+    public Iterator<Attribute> iterator() {
+        return new Iterator<Attribute>() {
+            int i = 0;
+
+            @Override
+            public boolean hasNext() {
+                return i < size;
+            }
+
+            @Override
+            public Attribute next() {
+                final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this);
+                i++;
+                return attr;
+            }
+
+            @Override
+            public void remove() {
+                Attributes.this.remove(--i); // next() advanced, so rewind
+            }
+        };
+    }
+
+    /**
+     Get the attributes as a List, for iteration.
+     @return a snapshot of the attributes as an unmodifiable List.
+     */
+    public List<Attribute> asList() {
+        ArrayList<Attribute> list = new ArrayList<>(size);
+        for (int i = 0; i < size; i++) {
+//            Attribute attr = vals[i] == null ?
+//                new BooleanAttribute(keys[i]) : // deprecated class, but maybe someone still wants it
+//                new Attribute(keys[i], vals[i], Attributes.this);
+//            list.add(attr);
+            list.add(new Attribute(keys[i], vals[i], Attributes.this));
+        }
+        return Collections.unmodifiableList(list);
+    }
+
+    /**
+     * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys
+     * starting with {@code data-}.
+     * @return map of custom data attributes.
+     */
+    public Map<String, String> dataset() {
+        return new Dataset(this);
+    }
+
+//    /**
+//     Get the HTML representation of these attributes.
+//     @return HTML
+//     @throws SerializationException if the HTML representation of the attributes cannot be constructed.
+//     */
+//    public String html() {
+//        StringBuilder accum = new StringBuilder();
+//        try {
+//            html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used
+//        } catch (IOException e) { // ought never happen
+//            throw new SerializationException(e);
+//        }
+//        return accum.toString();
+//    }
+//
+//    final void html(final Appendable accum, final Document.OutputSettings out) throws IOException {
+//        final int sz = size;
+//        for (int i = 0; i < sz; i++) {
+//            // inlined from Attribute.html()
+//            final String key = keys[i];
+//            final String val = vals[i];
+//            accum.append(' ').append(key);
+//
+//            // collapse checked=null, checked="", checked=checked; write out others
+//            if (!Attribute.shouldCollapseAttribute(key, val, out)) {
+//                accum.append("=\"");
+//                Entities.escape(accum, val == null ? EmptyString : val, out, true, false, false);
+//                accum.append('"');
+//            }
+//        }
+//    }
+//
+//    @Override
+//    public String toString() {
+//        return html();
+//    }
+
+    /**
+     * Checks if these attributes are equal to another set of attributes, by comparing the two sets
+     * slot by slot (same keys and values in the same order). Spare array capacity is ignored.
+     * @param o attributes to compare with
+     * @return if both sets of attributes have the same content
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        Attributes that = (Attributes) o;
+
+        if (size != that.size) return false;
+        // compare only the used slots: the backing arrays may differ in unused capacity
+        for (int i = 0; i < size; i++) {
+            if (!sameString(keys[i], that.keys[i]) || !sameString(vals[i], that.vals[i]))
+                return false;
+        }
+        return true;
+    }
+
+    // null-safe string equality (values are null for boolean attributes)
+    private static boolean sameString(String a, String b) {
+        return a == null ? b == null : a.equals(b);
+    }
+
+    /**
+     * Calculates the hashcode of these attributes, by combining the hashcodes of the keys and values in
+     * the used slots. Spare array capacity is ignored, so the result is consistent with {@link #equals(Object)}.
+     * @return calculated hashcode
+     */
+    @Override
+    public int hashCode() {
+        int result = size;
+        for (int i = 0; i < size; i++) {
+            result = 31 * result + (keys[i] != null ? keys[i].hashCode() : 0);
+            result = 31 * result + (vals[i] != null ? vals[i].hashCode() : 0);
+        }
+        return result;
+    }
+
+    /**
+     * Creates a copy of these attributes with its own key and value arrays, so later changes to either
+     * object do not affect the other. The receiver is left untouched.
+     * @return the copy
+     */
+    @Override
+    public Attributes clone() {
+        Attributes clone;
+        try {
+            clone = (Attributes) super.clone();
+        } catch (CloneNotSupportedException e) {
+            throw new RuntimeException(e);
+        }
+        clone.size = size;
+        // super.clone() shares the array references; give the copy its own arrays
+        clone.keys = copyOf(keys, size);
+        clone.vals = copyOf(vals, size);
+        return clone;
+    }
+
+    /**
+     * Internal method. Lowercases all keys.
+     */
+    public void normalize() {
+        for (int i = 0; i < size; i++) {
+            keys[i] = lowerCase(keys[i]);
+        }
+    }
+
+    // Map view over the attributes whose keys start with dataPrefix; map keys omit that prefix
+    private static class Dataset extends AbstractMap<String, String> {
+        private final Attributes attributes;
+
+        private Dataset(Attributes attributes) {
+            this.attributes = attributes;
+        }
+
+        @Override
+        public Set<Entry<String, String>> entrySet() {
+            return new EntrySet();
+        }
+
+        @Override
+        public String put(String key, String value) {
+            String dataKey = dataKey(key);
+            String oldValue = attributes.hasKey(dataKey) ? attributes.get(dataKey) : null;
+            attributes.put(dataKey, value);
+            return oldValue;
+        }
+
+        private class EntrySet extends AbstractSet<Map.Entry<String, String>> {
+
+            @Override
+            public Iterator<Map.Entry<String, String>> iterator() {
+                return new DatasetIterator();
+            }
+
+            @Override
+            public int size() {
+                int count = 0;
+                Iterator<Map.Entry<String, String>> iter = new DatasetIterator();
+                // DatasetIterator.hasNext() itself advances to the next data attribute,
+                // so this loop terminates without calling next()
+                while (iter.hasNext())
+                    count++;
+                return count;
+            }
+        }
+
+        private class DatasetIterator implements Iterator<Map.Entry<String, String>> {
+            private Iterator<Attribute> attrIter = attributes.iterator();
+            private Attribute attr;
+            // advances the underlying iterator to the next data attribute; not idempotent,
+            // so call it exactly once before each next()
+            public boolean hasNext() {
+                while (attrIter.hasNext()) {
+                    attr = attrIter.next();
+                    if (attr.isDataAttribute()) return true;
+                }
+                return false;
+            }
+
+            public Entry<String, String> next() {
+                return new Attribute(attr.getKey().substring(dataPrefix.length()), attr.getValue());
+            }
+
+            public void remove() {
+                attributes.remove(attr.getKey());
+            }
+        }
+    }
+
+    private static String dataKey(String key) {
+        return dataPrefix + key;
+    }
+}
\ No newline at end of file
diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/DocumentType.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/DocumentType.java
new file mode 100644
index 00000000..dc11e537
--- /dev/null
+++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/DocumentType.java
@@ -0,0 +1,104 @@
+package ru.noties.markwon.html.jsoup.nodes;
+
+/**
+ * A {@code } node.
+ */
+public class DocumentType /*extends LeafNode*/ {
+    // todo needs a bit of a chunky cleanup. this level of detail isn't needed
+    public static final String PUBLIC_KEY = "PUBLIC";
+    public static final String SYSTEM_KEY = "SYSTEM";
+//    private static final String NAME = "name";
+//    private static final String PUB_SYS_KEY = "pubSysKey"; // PUBLIC or SYSTEM
+//    private static final String PUBLIC_ID = "publicId";
+//    private static final String SYSTEM_ID = "systemId";
+    // todo: quirk mode from publicId and systemId
+
+//    /**
+//     * Create a new doctype element.
+// * @param name the doctype's name +// * @param publicId the doctype's public ID +// * @param systemId the doctype's system ID +// */ +// public DocumentType(String name, String publicId, String systemId) { +// Validate.notNull(name); +// Validate.notNull(publicId); +// Validate.notNull(systemId); +// attr(NAME, name); +// attr(PUBLIC_ID, publicId); +// if (has(PUBLIC_ID)) { +// attr(PUB_SYS_KEY, PUBLIC_KEY); +// } +// attr(SYSTEM_ID, systemId); +// } +// +// /** +// * Create a new doctype element. +// * @param name the doctype's name +// * @param publicId the doctype's public ID +// * @param systemId the doctype's system ID +// * @param baseUri unused +// * @deprecated +// */ +// public DocumentType(String name, String publicId, String systemId, String baseUri) { +// attr(NAME, name); +// attr(PUBLIC_ID, publicId); +// if (has(PUBLIC_ID)) { +// attr(PUB_SYS_KEY, PUBLIC_KEY); +// } +// attr(SYSTEM_ID, systemId); +// } +// +// /** +// * Create a new doctype element. +// * @param name the doctype's name +// * @param publicId the doctype's public ID +// * @param systemId the doctype's system ID +// * @param baseUri unused +// * @deprecated +// */ +// public DocumentType(String name, String pubSysKey, String publicId, String systemId, String baseUri) { +// attr(NAME, name); +// if (pubSysKey != null) { +// attr(PUB_SYS_KEY, pubSysKey); +// } +// attr(PUBLIC_ID, publicId); +// attr(SYSTEM_ID, systemId); +// } +// public void setPubSysKey(String value) { +// if (value != null) +// attr(PUB_SYS_KEY, value); +// } +// +// @Override +// public String nodeName() { +// return "#doctype"; +// } +// +// @Override +// void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException { +// if (out.syntax() == Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) { +// // looks like a html5 doctype, go lowercase for aesthetics +// accum.append("'); +// } +// +// @Override +// void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) 
{ +// } +// +// private boolean has(final String attribute) { +// return !StringUtil.isBlank(attr(attribute)); +// } +} + diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Entities.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Entities.java new file mode 100644 index 00000000..c6c8d829 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/Entities.java @@ -0,0 +1,351 @@ +package ru.noties.markwon.html.jsoup.nodes; + +import java.nio.charset.CharsetEncoder; +import java.util.Arrays; +import java.util.HashMap; + +import ru.noties.markwon.html.jsoup.helper.Validate; +import ru.noties.markwon.html.jsoup.parser.CharacterReader; + +import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.base; +import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.extended; + +/** + * HTML entities, and escape routines. Source: W3C + * HTML named character references. + */ +public class Entities { + private static final int empty = -1; + private static final String emptyName = ""; + static final int codepointRadix = 36; + private static final char[] codeDelims = {',', ';'}; + private static final HashMap multipoints = new HashMap<>(); // name -> multiple character references +// private static final Document.OutputSettings DefaultOutput = new Document.OutputSettings(); + + public enum EscapeMode { + /** + * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. + */ + xhtml(EntitiesData.xmlPoints, 4), + /** + * Default HTML output entities. + */ + base(EntitiesData.basePoints, 106), + /** + * Complete HTML entities. + */ + extended(EntitiesData.fullPoints, 2125); + + // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities. + private String[] nameKeys; + private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints. 
+ + // table of codepoints to named entities. + private int[] codeKeys; // we don' support multicodepoints to single named value currently + private String[] nameVals; + + EscapeMode(String file, int size) { + load(this, file, size); + } + + int codepointForName(final String name) { + int index = Arrays.binarySearch(nameKeys, name); + return index >= 0 ? codeVals[index] : empty; + } + + String nameForCodepoint(final int codepoint) { + final int index = Arrays.binarySearch(codeKeys, codepoint); + if (index >= 0) { + // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower + // (and binary search for same item with multi results is undefined + return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ? + nameVals[index + 1] : nameVals[index]; + } + return emptyName; + } + + private int size() { + return nameKeys.length; + } + } + + private Entities() { + } + + /** + * Check if the input is a known named entity + * + * @param name the possible entity name (e.g. "lt" or "amp") + * @return true if a known named entity + */ + public static boolean isNamedEntity(final String name) { + return extended.codepointForName(name) != empty; + } + + /** + * Check if the input is a known named entity in the base entity set. + * + * @param name the possible entity name (e.g. "lt" or "amp") + * @return true if a known named entity in the base set + * @see #isNamedEntity(String) + */ + public static boolean isBaseNamedEntity(final String name) { + return base.codepointForName(name) != empty; + } + + /** + * Get the Character value of the named entity + * + * @param name named entity (e.g. "lt" or "amp") + * @return the Character value of the named entity (e.g. 
'{@literal <}' or '{@literal &}')
+     * @deprecated does not support characters outside the BMP or multiple character names
+     */
+    @Deprecated
+    public static Character getCharacterByName(String name) {
+        return (char) extended.codepointForName(name);
+    }
+
+    /**
+     * Get the character(s) represented by the named entity
+     *
+     * @param name entity (e.g. "lt" or "amp")
+     * @return the string value of the character(s) represented by this entity, or "" if not defined
+     */
+    public static String getByName(String name) {
+        // multi-character entities are kept in their own table and take precedence
+        String val = multipoints.get(name);
+        if (val != null)
+            return val;
+        int codepoint = extended.codepointForName(name);
+        if (codepoint != empty)
+            return new String(new int[]{codepoint}, 0, 1);
+        return emptyName;
+    }
+
+    /**
+     * Look up the code point(s) for a named entity and write them into the supplied array.
+     *
+     * @param name entity name (e.g. "lt" or "amp")
+     * @param codepoints destination array; index 0 (and index 1 for a two-character entity) is overwritten,
+     *                   so it needs room for two values
+     * @return the number of code points written: 2 for a multi-character entity, 1 for a single-character
+     * entity, 0 if the name is not known (the array is then left untouched)
+     */
+    public static int codepointsForName(final String name, final int[] codepoints) {
+        String val = multipoints.get(name);
+        if (val != null) {
+            final int first = val.codePointAt(0);
+            codepoints[0] = first;
+            // step over the whole first code point so a surrogate pair is never split
+            codepoints[1] = val.codePointAt(Character.charCount(first));
+            return 2;
+        }
+        int codepoint = extended.codepointForName(name);
+        if (codepoint != empty) {
+            codepoints[0] = codepoint;
+            return 1;
+        }
+        return 0;
+    }
+
+//    /**
+//     * HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
+//     *
+//     * @param string the un-escaped string to escape
+//     * @param out the output settings to use
+//     * @return the escaped string
+//     */
+//    public static String escape(String string, Document.OutputSettings out) {
+//        if (string == null)
+//            return "";
+//        StringBuilder accum = new StringBuilder(string.length() * 2);
+//        try {
+//            escape(accum, string, out, false, false, false);
+//        } catch (IOException e) {
+//            throw new SerializationException(e); // doesn't happen
+//        }
+//        return accum.toString();
+//    }
+
+//    /**
+//     * HTML escape an input string, using the default settings (UTF-8, base entities).
That is, {@code <} is returned as +// * {@code <} +// * +// * @param string the un-escaped string to escape +// * @return the escaped string +// */ +// public static String escape(String string) { +// return escape(string, DefaultOutput); +// } +// +// // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations +// static void escape(Appendable accum, String string, Document.OutputSettings out, +// boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException { +// +// boolean lastWasWhite = false; +// boolean reachedNonWhite = false; +// final EscapeMode escapeMode = out.escapeMode(); +// final CharsetEncoder encoder = out.encoder(); +// final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder() +// final int length = string.length(); +// +// int codePoint; +// for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) { +// codePoint = string.codePointAt(offset); +// +// if (normaliseWhite) { +// if (StringUtil.isWhitespace(codePoint)) { +// if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite) +// continue; +// accum.append(' '); +// lastWasWhite = true; +// continue; +// } else { +// lastWasWhite = false; +// reachedNonWhite = true; +// } +// } +// // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): +// if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) { +// final char c = (char) codePoint; +// // html specific and required escapes: +// switch (c) { +// case '&': +// accum.append("&"); +// break; +// case 0xA0: +// if (escapeMode != EscapeMode.xhtml) +// accum.append(" "); +// else +// accum.append(" "); +// break; +// case '<': +// // escape when in character data or when in a xml attribue val; not needed in html attr val +// if (!inAttribute || escapeMode == EscapeMode.xhtml) +// accum.append("<"); +// else +// accum.append(c); +// break; +// case '>': +// if 
(!inAttribute) +// accum.append(">"); +// else +// accum.append(c); +// break; +// case '"': +// if (inAttribute) +// accum.append("""); +// else +// accum.append(c); +// break; +// default: +// if (canEncode(coreCharset, c, encoder)) +// accum.append(c); +// else +// appendEncoded(accum, escapeMode, codePoint); +// } +// } else { +// final String c = new String(Character.toChars(codePoint)); +// if (encoder.canEncode(c)) // uses fallback encoder for simplicity +// accum.append(c); +// else +// appendEncoded(accum, escapeMode, codePoint); +// } +// } +// } + +// private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException { +// final String name = escapeMode.nameForCodepoint(codePoint); +// if (name != emptyName) // ok for identity check +// accum.append('&').append(name).append(';'); +// else +// accum.append("&#x").append(Integer.toHexString(codePoint)).append(';'); +// } + +// /** +// * Un-escape an HTML escaped string. That is, {@code <} is returned as {@code <}. +// * +// * @param string the HTML string to un-escape +// * @return the unescaped string +// */ +// public static String unescape(String string) { +// return unescape(string, false); +// } + +// /** +// * Unescape the input string. +// * +// * @param string to un-HTML-escape +// * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) +// * @return unescaped string +// */ +// static String unescape(String string, boolean strict) { +// return Parser.unescapeEntities(string, strict); +// } + + /* + * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean. + * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF, + * performance may be bad. We can add more encoders for common character sets that are impacted by performance + * issues on Android if required. 
+ * + * Benchmarks: * + * OLD toHtml() impl v New (fastpath) in millis + * Wiki: 1895, 16 + * CNN: 6378, 55 + * Alterslash: 3013, 28 + * Jsoup: 167, 2 + */ + private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) { + // todo add more charset tests if impacted by Android's bad perf in canEncode + switch (charset) { + case ascii: + return c < 0x80; + case utf: + return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above + default: + return fallback.canEncode(c); + } + } + + enum CoreCharset { + ascii, utf, fallback; + + static CoreCharset byName(final String name) { + if (name.equals("US-ASCII")) + return ascii; + if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al + return utf; + return fallback; + } + } + + private static void load(EscapeMode e, String pointsData, int size) { + e.nameKeys = new String[size]; + e.codeVals = new int[size]; + e.codeKeys = new int[size]; + e.nameVals = new String[size]; + + int i = 0; + CharacterReader reader = new CharacterReader(pointsData); + + while (!reader.isEmpty()) { + // NotNestedLessLess=10913,824;1887& + + final String name = reader.consumeTo('='); + reader.advance(); + final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix); + final char codeDelim = reader.current(); + reader.advance(); + final int cp2; + if (codeDelim == ',') { + cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix); + reader.advance(); + } else { + cp2 = empty; + } + final String indexS = reader.consumeTo('&'); + final int index = Integer.parseInt(indexS, codepointRadix); + reader.advance(); + + e.nameKeys[i] = name; + e.codeVals[i] = cp1; + e.codeKeys[index] = cp1; + e.nameVals[index] = name; + + if (cp2 != empty) { + multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2)); + } + i++; + } + + Validate.isTrue(i == size, "Unexpected count of entities loaded"); + } +} diff --git 
a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/EntitiesData.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/EntitiesData.java new file mode 100644 index 00000000..036c712f --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/nodes/EntitiesData.java @@ -0,0 +1,11 @@ +package ru.noties.markwon.html.jsoup.nodes; + +/** + * Holds packed data that represents Entity name=value pairs. Parsed by Entities, created by BuildEntities. + */ +class EntitiesData { + static final String xmlPoints = "amp=12;1>=1q;3<=1o;2"=y;0&"; + static final String basePoints = "AElig=5i;1c&=12;2Á=5d;17Â=5e;18À=5c;16Å=5h;1bÃ=5f;19Ä=5g;1a©=4p;hÇ=5j;1dÐ=5s;1mÉ=5l;1fÊ=5m;1gÈ=5k;1eË=5n;1h>=1q;6Í=5p;1jÎ=5q;1kÌ=5o;1iÏ=5r;1l<=1o;4Ñ=5t;1nÓ=5v;1pÔ=5w;1qÒ=5u;1oØ=60;1uÕ=5x;1rÖ=5y;1s"=y;0®=4u;nÞ=66;20Ú=62;1wÛ=63;1xÙ=61;1vÜ=64;1yÝ=65;1zá=69;23â=6a;24´=50;uæ=6e;28à=68;22&=12;3å=6d;27ã=6b;25ä=6c;26¦=4m;eç=6f;29¸=54;y¢=4i;a©=4p;i¤=4k;c°=4w;q÷=6v;2pé=6h;2bê=6i;2cè=6g;2að=6o;2ië=6j;2d½=59;13¼=58;12¾=5a;14>=1q;7í=6l;2fî=6m;2g¡=4h;9ì=6k;2e¿=5b;15ï=6n;2h«=4r;k<=1o;5¯=4v;pµ=51;v·=53;x =4g;8¬=4s;lñ=6p;2jó=6r;2lô=6s;2mò=6q;2kª=4q;jº=56;10ø=6w;2qõ=6t;2nö=6u;2o¶=52;w±=4x;r£=4j;b"=y;1»=57;11®=4u;o§=4n;f­=4t;m¹=55;z²=4y;s³=4z;tß=67;21þ=72;2w×=5z;1tú=6y;2sû=6z;2tù=6x;2r¨=4o;gü=70;2uý=71;2v¥=4l;dÿ=73;2x&"; + static final String fullPoints = 
"AElig=5i;2v&=12;8Á=5d;2p&Abreve=76;4kÂ=5e;2q&Acy=sw;av&Afr=2kn8;1khÀ=5c;2o&Alpha=pd;8d&Amacr=74;4i&And=8cz;1e1&Aogon=78;4m&Aopf=2koo;1ls&ApplyFunction=6e9;ewÅ=5h;2t&Ascr=2kkc;1jc&Assign=6s4;s6Ã=5f;2rÄ=5g;2s&Backslash=6qe;o1&Barv=8h3;1it&Barwed=6x2;120&Bcy=sx;aw&Because=6r9;pw&Bernoullis=6jw;gn&Beta=pe;8e&Bfr=2kn9;1ki&Bopf=2kop;1lt&Breve=k8;82&Bscr=6jw;gp&Bumpeq=6ry;ro&CHcy=tj;bi©=4p;1q&Cacute=7a;4o&Cap=6vm;zz&CapitalDifferentialD=6kl;h8&Cayleys=6jx;gq&Ccaron=7g;4uÇ=5j;2w&Ccirc=7c;4q&Cconint=6r4;pn&Cdot=7e;4s&Cedilla=54;2e&CenterDot=53;2b&Cfr=6jx;gr&Chi=pz;8y&CircleDot=6u1;x8&CircleMinus=6ty;x3&CirclePlus=6tx;x1&CircleTimes=6tz;x5&ClockwiseContourIntegral=6r6;pp&CloseCurlyDoubleQuote=6cd;e0&CloseCurlyQuote=6c9;dt&Colon=6rb;q1&Colone=8dw;1en&Congruent=6sh;sn&Conint=6r3;pm&ContourIntegral=6r2;pi&Copf=6iq;f7&Coproduct=6q8;nq&CounterClockwiseContourIntegral=6r7;pr&Cross=8bz;1d8&Cscr=2kke;1jd&Cup=6vn;100&CupCap=6rx;rk&DD=6kl;h9&DDotrahd=841;184&DJcy=si;ai&DScy=sl;al&DZcy=sv;au&Dagger=6ch;e7&Darr=6n5;j5&Dashv=8h0;1ir&Dcaron=7i;4w&Dcy=t0;az&Del=6pz;n9&Delta=pg;8g&Dfr=2knb;1kj&DiacriticalAcute=50;27&DiacriticalDot=k9;84&DiacriticalDoubleAcute=kd;8a&DiacriticalGrave=2o;13&DiacriticalTilde=kc;88&Diamond=6v8;za&DifferentialD=6km;ha&Dopf=2kor;1lu&Dot=4o;1n&DotDot=6ho;f5&DotEqual=6s0;rw&DoubleContourIntegral=6r3;pl&DoubleDot=4o;1m&DoubleDownArrow=6oj;m0&DoubleLeftArrow=6og;lq&DoubleLeftRightArrow=6ok;m3&DoubleLeftTee=8h0;1iq&DoubleLongLeftArrow=7w8;17g&DoubleLongLeftRightArrow=7wa;17m&DoubleLongRightArrow=7w9;17j&DoubleRightArrow=6oi;lw&DoubleRightTee=6ug;xz&DoubleUpArrow=6oh;lt&DoubleUpDownArrow=6ol;m7&DoubleVerticalBar=6qt;ov&DownArrow=6mr;i8&DownArrowBar=843;186&DownArrowUpArrow=6ph;mn&DownBreve=lt;8c&DownLeftRightVector=85s;198&DownLeftTeeVector=866;19m&DownLeftVector=6nx;ke&DownLeftVectorBar=85y;19e&DownRightTeeVector=867;19n&DownRightVector=6o1;kq&DownRightVectorBar=85z;19f&DownTee=6uc;xs&DownTeeArrow=6nb;jh&Downarrow=6oj;m1&Dscr=2kkf;1je&Dstrok=7k;4y&ENG=96;6gÐ=5s;35É=5l;
2y&Ecaron=7u;56Ê=5m;2z&Ecy=tp;bo&Edot=7q;52&Efr=2knc;1kkÈ=5k;2x&Element=6q0;na&Emacr=7m;50&EmptySmallSquare=7i3;15x&EmptyVerySmallSquare=7fv;150&Eogon=7s;54&Eopf=2kos;1lv&Epsilon=ph;8h&Equal=8dx;1eo&EqualTilde=6rm;qp&Equilibrium=6oc;li&Escr=6k0;gu&Esim=8dv;1em&Eta=pj;8jË=5n;30&Exists=6pv;mz&ExponentialE=6kn;hc&Fcy=tg;bf&Ffr=2knd;1kl&FilledSmallSquare=7i4;15y&FilledVerySmallSquare=7fu;14w&Fopf=2kot;1lw&ForAll=6ps;ms&Fouriertrf=6k1;gv&Fscr=6k1;gw&GJcy=sj;aj>=1q;r&Gamma=pf;8f&Gammad=rg;a5&Gbreve=7y;5a&Gcedil=82;5e&Gcirc=7w;58&Gcy=sz;ay&Gdot=80;5c&Gfr=2kne;1km&Gg=6vt;10c&Gopf=2kou;1lx&GreaterEqual=6sl;sv&GreaterEqualLess=6vv;10i&GreaterFullEqual=6sn;t6&GreaterGreater=8f6;1gh&GreaterLess=6t3;ul&GreaterSlantEqual=8e6;1f5&GreaterTilde=6sz;ub&Gscr=2kki;1jf&Gt=6sr;tr&HARDcy=tm;bl&Hacek=jr;80&Hat=2m;10&Hcirc=84;5f&Hfr=6j0;fe&HilbertSpace=6iz;fa&Hopf=6j1;fg&HorizontalLine=7b4;13i&Hscr=6iz;fc&Hstrok=86;5h&HumpDownHump=6ry;rn&HumpEqual=6rz;rs&IEcy=t1;b0&IJlig=8i;5s&IOcy=sh;ahÍ=5p;32Î=5q;33&Icy=t4;b3&Idot=8g;5p&Ifr=6j5;fqÌ=5o;31&Im=6j5;fr&Imacr=8a;5l&ImaginaryI=6ko;hf&Implies=6oi;ly&Int=6r0;pf&Integral=6qz;pd&Intersection=6v6;z4&InvisibleComma=6eb;f0&InvisibleTimes=6ea;ey&Iogon=8e;5n&Iopf=2kow;1ly&Iota=pl;8l&Iscr=6j4;fn&Itilde=88;5j&Iukcy=sm;amÏ=5r;34&Jcirc=8k;5u&Jcy=t5;b4&Jfr=2knh;1kn&Jopf=2kox;1lz&Jscr=2kkl;1jg&Jsercy=so;ao&Jukcy=sk;ak&KHcy=th;bg&KJcy=ss;as&Kappa=pm;8m&Kcedil=8m;5w&Kcy=t6;b5&Kfr=2kni;1ko&Kopf=2koy;1m0&Kscr=2kkm;1jh&LJcy=sp;ap<=1o;m&Lacute=8p;5z&Lambda=pn;8n&Lang=7vu;173&Laplacetrf=6j6;fs&Larr=6n2;j1&Lcaron=8t;63&Lcedil=8r;61&Lcy=t7;b6&LeftAngleBracket=7vs;16x&LeftArrow=6mo;hu&LeftArrowBar=6p0;mj&LeftArrowRightArrow=6o6;l3&LeftCeiling=6x4;121&LeftDoubleBracket=7vq;16t&LeftDownTeeVector=869;19p&LeftDownVector=6o3;kw&LeftDownVectorBar=861;19h&LeftFloor=6x6;125&LeftRightArrow=6ms;ib&LeftRightVector=85q;196&LeftTee=6ub;xq&LeftTeeArrow=6n8;ja&LeftTeeVector=862;19i&LeftTriangle=6uq;ya&LeftTriangleBar=89b;1c0&LeftTriangleEqual=6us;yg&LeftUpDownVector=85t;199&LeftUpTeeV
ector=868;19o&LeftUpVector=6nz;kk&LeftUpVectorBar=860;19g&LeftVector=6nw;kb&LeftVectorBar=85u;19a&Leftarrow=6og;lr&Leftrightarrow=6ok;m4&LessEqualGreater=6vu;10e&LessFullEqual=6sm;t0&LessGreater=6t2;ui&LessLess=8f5;1gf&LessSlantEqual=8e5;1ez&LessTilde=6sy;u8&Lfr=2knj;1kp&Ll=6vs;109&Lleftarrow=6oq;me&Lmidot=8v;65&LongLeftArrow=7w5;177&LongLeftRightArrow=7w7;17d&LongRightArrow=7w6;17a&Longleftarrow=7w8;17h&Longleftrightarrow=7wa;17n&Longrightarrow=7w9;17k&Lopf=2koz;1m1&LowerLeftArrow=6mx;iq&LowerRightArrow=6mw;in&Lscr=6j6;fu&Lsh=6nk;jv&Lstrok=8x;67&Lt=6sq;tl&Map=83p;17v&Mcy=t8;b7&MediumSpace=6e7;eu&Mellintrf=6k3;gx&Mfr=2knk;1kq&MinusPlus=6qb;nv&Mopf=2kp0;1m2&Mscr=6k3;gz&Mu=po;8o&NJcy=sq;aq&Nacute=8z;69&Ncaron=93;6d&Ncedil=91;6b&Ncy=t9;b8&NegativeMediumSpace=6bv;dc&NegativeThickSpace=6bv;dd&NegativeThinSpace=6bv;de&NegativeVeryThinSpace=6bv;db&NestedGreaterGreater=6sr;tq&NestedLessLess=6sq;tk&NewLine=a;1&Nfr=2knl;1kr&NoBreak=6e8;ev&NonBreakingSpace=4g;1d&Nopf=6j9;fx&Not=8h8;1ix&NotCongruent=6si;sp&NotCupCap=6st;tv&NotDoubleVerticalBar=6qu;p0&NotElement=6q1;ne&NotEqual=6sg;sk&NotEqualTilde=6rm,mw;qn&NotExists=6pw;n1&NotGreater=6sv;tz&NotGreaterEqual=6sx;u5&NotGreaterFullEqual=6sn,mw;t3&NotGreaterGreater=6sr,mw;tn&NotGreaterLess=6t5;uq&NotGreaterSlantEqual=8e6,mw;1f2&NotGreaterTilde=6t1;ug&NotHumpDownHump=6ry,mw;rl&NotHumpEqual=6rz,mw;rq&NotLeftTriangle=6wa;113&NotLeftTriangleBar=89b,mw;1bz&NotLeftTriangleEqual=6wc;119&NotLess=6su;tw&NotLessEqual=6sw;u2&NotLessGreater=6t4;uo&NotLessLess=6sq,mw;th&NotLessSlantEqual=8e5,mw;1ew&NotLessTilde=6t0;ue&NotNestedGreaterGreater=8f6,mw;1gg&NotNestedLessLess=8f5,mw;1ge&NotPrecedes=6tc;vb&NotPrecedesEqual=8fj,mw;1gv&NotPrecedesSlantEqual=6w0;10p&NotReverseElement=6q4;nl&NotRightTriangle=6wb;116&NotRightTriangleBar=89c,mw;1c1&NotRightTriangleEqual=6wd;11c&NotSquareSubset=6tr,mw;wh&NotSquareSubsetEqual=6w2;10t&NotSquareSuperset=6ts,mw;wl&NotSquareSupersetEqual=6w3;10v&NotSubset=6te,6he;vh&NotSubsetEqual=6tk;w0&NotSucceeds=6td;ve&NotSuc
ceedsEqual=8fk,mw;1h1&NotSucceedsSlantEqual=6w1;10r&NotSucceedsTilde=6tb,mw;v7&NotSuperset=6tf,6he;vm&NotSupersetEqual=6tl;w3&NotTilde=6rl;ql&NotTildeEqual=6ro;qv&NotTildeFullEqual=6rr;r1&NotTildeTilde=6rt;r9&NotVerticalBar=6qs;or&Nscr=2kkp;1jiÑ=5t;36&Nu=pp;8p&OElig=9e;6mÓ=5v;38Ô=5w;39&Ocy=ta;b9&Odblac=9c;6k&Ofr=2knm;1ksÒ=5u;37&Omacr=98;6i&Omega=q1;90&Omicron=pr;8r&Oopf=2kp2;1m3&OpenCurlyDoubleQuote=6cc;dy&OpenCurlyQuote=6c8;dr&Or=8d0;1e2&Oscr=2kkq;1jjØ=60;3dÕ=5x;3a&Otimes=8c7;1dfÖ=5y;3b&OverBar=6da;em&OverBrace=732;13b&OverBracket=71w;134&OverParenthesis=730;139&PartialD=6pu;mx&Pcy=tb;ba&Pfr=2knn;1kt&Phi=py;8x&Pi=ps;8s&PlusMinus=4x;22&Poincareplane=6j0;fd&Popf=6jd;g3&Pr=8fv;1hl&Precedes=6t6;us&PrecedesEqual=8fj;1gy&PrecedesSlantEqual=6t8;uy&PrecedesTilde=6ta;v4&Prime=6cz;eg&Product=6q7;no&Proportion=6rb;q0&Proportional=6ql;oa&Pscr=2kkr;1jk&Psi=q0;8z"=y;3&Qfr=2kno;1ku&Qopf=6je;g5&Qscr=2kks;1jl&RBarr=840;183®=4u;1x&Racute=9g;6o&Rang=7vv;174&Rarr=6n4;j4&Rarrtl=846;187&Rcaron=9k;6s&Rcedil=9i;6q&Rcy=tc;bb&Re=6jg;gb&ReverseElement=6q3;nh&ReverseEquilibrium=6ob;le&ReverseUpEquilibrium=86n;1a4&Rfr=6jg;ga&Rho=pt;8t&RightAngleBracket=7vt;170&RightArrow=6mq;i3&RightArrowBar=6p1;ml&RightArrowLeftArrow=6o4;ky&RightCeiling=6x5;123&RightDoubleBracket=7vr;16v&RightDownTeeVector=865;19l&RightDownVector=6o2;kt&RightDownVectorBar=85x;19d&RightFloor=6x7;127&RightTee=6ua;xo&RightTeeArrow=6na;je&RightTeeVector=863;19j&RightTriangle=6ur;yd&RightTriangleBar=89c;1c2&RightTriangleEqual=6ut;yk&RightUpDownVector=85r;197&RightUpTeeVector=864;19k&RightUpVector=6ny;kh&RightUpVectorBar=85w;19c&RightVector=6o0;kn&RightVectorBar=85v;19b&Rightarrow=6oi;lx&Ropf=6jh;gd&RoundImplies=86o;1a6&Rrightarrow=6or;mg&Rscr=6jf;g7&Rsh=6nl;jx&RuleDelayed=8ac;1cb&SHCHcy=tl;bk&SHcy=tk;bj&SOFTcy=to;bn&Sacute=9m;6u&Sc=8fw;1hm&Scaron=9s;70&Scedil=9q;6y&Scirc=9o;6w&Scy=td;bc&Sfr=2knq;1kv&ShortDownArrow=6mr;i7&ShortLeftArrow=6mo;ht&ShortRightArrow=6mq;i2&ShortUpArrow=6mp;hy&Sigma=pv;8u&SmallCircle=6qg;o6&Sopf=2kp6;1m4&S
qrt=6qi;o9&Square=7fl;14t&SquareIntersection=6tv;ww&SquareSubset=6tr;wi&SquareSubsetEqual=6tt;wp&SquareSuperset=6ts;wm&SquareSupersetEqual=6tu;ws&SquareUnion=6tw;wz&Sscr=2kku;1jm&Star=6va;zf&Sub=6vk;zw&Subset=6vk;zv&SubsetEqual=6ti;vu&Succeeds=6t7;uv&SucceedsEqual=8fk;1h4&SucceedsSlantEqual=6t9;v1&SucceedsTilde=6tb;v8&SuchThat=6q3;ni&Sum=6q9;ns&Sup=6vl;zy&Superset=6tf;vp&SupersetEqual=6tj;vx&Supset=6vl;zxÞ=66;3j&TRADE=6jm;gf&TSHcy=sr;ar&TScy=ti;bh&Tab=9;0&Tau=pw;8v&Tcaron=9w;74&Tcedil=9u;72&Tcy=te;bd&Tfr=2knr;1kw&Therefore=6r8;pt&Theta=pk;8k&ThickSpace=6e7,6bu;et&ThinSpace=6bt;d7&Tilde=6rg;q9&TildeEqual=6rn;qs&TildeFullEqual=6rp;qy&TildeTilde=6rs;r4&Topf=2kp7;1m5&TripleDot=6hn;f3&Tscr=2kkv;1jn&Tstrok=9y;76Ú=62;3f&Uarr=6n3;j2&Uarrocir=85l;193&Ubrcy=su;at&Ubreve=a4;7cÛ=63;3g&Ucy=tf;be&Udblac=a8;7g&Ufr=2kns;1kxÙ=61;3e&Umacr=a2;7a&UnderBar=2n;11&UnderBrace=733;13c&UnderBracket=71x;136&UnderParenthesis=731;13a&Union=6v7;z8&UnionPlus=6tq;wf&Uogon=aa;7i&Uopf=2kp8;1m6&UpArrow=6mp;hz&UpArrowBar=842;185&UpArrowDownArrow=6o5;l1&UpDownArrow=6mt;ie&UpEquilibrium=86m;1a2&UpTee=6ud;xv&UpTeeArrow=6n9;jc&Uparrow=6oh;lu&Updownarrow=6ol;m8&UpperLeftArrow=6mu;ih&UpperRightArrow=6mv;ik&Upsi=r6;9z&Upsilon=px;8w&Uring=a6;7e&Uscr=2kkw;1jo&Utilde=a0;78Ü=64;3h&VDash=6uj;y3&Vbar=8h7;1iw&Vcy=sy;ax&Vdash=6uh;y1&Vdashl=8h2;1is&Vee=6v5;z3&Verbar=6c6;dp&Vert=6c6;dq&VerticalBar=6qr;on&VerticalLine=3g;18&VerticalSeparator=7rs;16o&VerticalTilde=6rk;qi&VeryThinSpace=6bu;d9&Vfr=2knt;1ky&Vopf=2kp9;1m7&Vscr=2kkx;1jp&Vvdash=6ui;y2&Wcirc=ac;7k&Wedge=6v4;z0&Wfr=2knu;1kz&Wopf=2kpa;1m8&Wscr=2kky;1jq&Xfr=2knv;1l0&Xi=pq;8q&Xopf=2kpb;1m9&Xscr=2kkz;1jr&YAcy=tr;bq&YIcy=sn;an&YUcy=tq;bpÝ=65;3i&Ycirc=ae;7m&Ycy=tn;bm&Yfr=2knw;1l1&Yopf=2kpc;1ma&Yscr=2kl0;1js&Yuml=ag;7o&ZHcy=t2;b1&Zacute=ah;7p&Zcaron=al;7t&Zcy=t3;b2&Zdot=aj;7r&ZeroWidthSpace=6bv;df&Zeta=pi;8i&Zfr=6js;gl&Zopf=6jo;gi&Zscr=2kl1;1jtá=69;3m&abreve=77;4l&ac=6ri;qg&acE=6ri,mr;qe&acd=6rj;qhâ=6a;3n´=50;28&acy=ts;bræ=6e;3r&af=6e9;ex&afr=2kny;1l2à=68;3l&alefsym=6
k5;h3&aleph=6k5;h4&alpha=q9;92&amacr=75;4j&amalg=8cf;1dm&=12;9&and=6qv;p6&andand=8d1;1e3&andd=8d8;1e9&andslope=8d4;1e6&andv=8d6;1e7&ang=6qo;oj&ange=884;1b1&angle=6qo;oi&angmsd=6qp;ol&angmsdaa=888;1b5&angmsdab=889;1b6&angmsdac=88a;1b7&angmsdad=88b;1b8&angmsdae=88c;1b9&angmsdaf=88d;1ba&angmsdag=88e;1bb&angmsdah=88f;1bc&angrt=6qn;og&angrtvb=6v2;yw&angrtvbd=87x;1b0&angsph=6qq;om&angst=5h;2u&angzarr=70c;12z&aogon=79;4n&aopf=2kpe;1mb&ap=6rs;r8&apE=8ds;1ej&apacir=8dr;1eh&ape=6ru;rd&apid=6rv;rf&apos=13;a&approx=6rs;r5&approxeq=6ru;rcå=6d;3q&ascr=2kl2;1ju&ast=16;e&asymp=6rs;r6&asympeq=6rx;rjã=6b;3oä=6c;3p&awconint=6r7;ps&awint=8b5;1cr&bNot=8h9;1iy&backcong=6rw;rg&backepsilon=s6;af&backprime=6d1;ei&backsim=6rh;qc&backsimeq=6vh;zp&barvee=6v1;yv&barwed=6x1;11y&barwedge=6x1;11x&bbrk=71x;137&bbrktbrk=71y;138&bcong=6rw;rh&bcy=tt;bs&bdquo=6ce;e4&becaus=6r9;py&because=6r9;px&bemptyv=88g;1bd&bepsi=s6;ag&bernou=6jw;go&beta=qa;93&beth=6k6;h5&between=6ss;tt&bfr=2knz;1l3&bigcap=6v6;z5&bigcirc=7hr;15s&bigcup=6v7;z7&bigodot=8ao;1cd&bigoplus=8ap;1cf&bigotimes=8aq;1ch&bigsqcup=8au;1cl&bigstar=7id;15z&bigtriangledown=7gd;15e&bigtriangleup=7g3;154&biguplus=8as;1cj&bigvee=6v5;z1&bigwedge=6v4;yy&bkarow=83x;17x&blacklozenge=8a3;1c9&blacksquare=7fu;14x&blacktriangle=7g4;156&blacktriangledown=7ge;15g&blacktriangleleft=7gi;15k&blacktriangleright=7g8;15a&blank=74z;13f&blk12=7f6;14r&blk14=7f5;14q&blk34=7f7;14s&block=7ew;14p&bne=1p,6hx;o&bnequiv=6sh,6hx;sm&bnot=6xc;12d&bopf=2kpf;1mc&bot=6ud;xx&bottom=6ud;xu&bowtie=6vc;zi&boxDL=7dj;141&boxDR=7dg;13y&boxDl=7di;140&boxDr=7df;13x&boxH=7dc;13u&boxHD=7dy;14g&boxHU=7e1;14j&boxHd=7dw;14e&boxHu=7dz;14h&boxUL=7dp;147&boxUR=7dm;144&boxUl=7do;146&boxUr=7dl;143&boxV=7dd;13v&boxVH=7e4;14m&boxVL=7dv;14d&boxVR=7ds;14a&boxVh=7e3;14l&boxVl=7du;14c&boxVr=7dr;149&boxbox=895;1bw&boxdL=7dh;13z&boxdR=7de;13w&boxdl=7bk;13m&boxdr=7bg;13l&boxh=7b4;13j&boxhD=7dx;14f&boxhU=7e0;14i&boxhd=7cc;13r&boxhu=7ck;13s&boxminus=6u7;xi&boxplus=6u6;xg&boxtimes=6u8;xk&boxuL=7dn;145&boxuR=7dk;1
42&boxul=7bs;13o&boxur=7bo;13n&boxv=7b6;13k&boxvH=7e2;14k&boxvL=7dt;14b&boxvR=7dq;148&boxvh=7cs;13t&boxvl=7c4;13q&boxvr=7bw;13p&bprime=6d1;ej&breve=k8;83¦=4m;1k&bscr=2kl3;1jv&bsemi=6dr;er&bsim=6rh;qd&bsime=6vh;zq&bsol=2k;x&bsolb=891;1bv&bsolhsub=7uw;16r&bull=6ci;e9&bullet=6ci;e8&bump=6ry;rp&bumpE=8fi;1gu&bumpe=6rz;ru&bumpeq=6rz;rt&cacute=7b;4p&cap=6qx;pa&capand=8ck;1dq&capbrcup=8cp;1dv&capcap=8cr;1dx&capcup=8cn;1dt&capdot=8cg;1dn&caps=6qx,1e68;p9&caret=6dd;eo&caron=jr;81&ccaps=8ct;1dz&ccaron=7h;4vç=6f;3s&ccirc=7d;4r&ccups=8cs;1dy&ccupssm=8cw;1e0&cdot=7f;4t¸=54;2f&cemptyv=88i;1bf¢=4i;1g¢erdot=53;2c&cfr=2ko0;1l4&chcy=uf;ce&check=7pv;16j&checkmark=7pv;16i&chi=qv;9s&cir=7gr;15q&cirE=88z;1bt&circ=jq;7z&circeq=6s7;sc&circlearrowleft=6nu;k6&circlearrowright=6nv;k8&circledR=4u;1w&circledS=79k;13g&circledast=6u3;xc&circledcirc=6u2;xa&circleddash=6u5;xe&cire=6s7;sd&cirfnint=8b4;1cq&cirmid=8hb;1j0&cirscir=88y;1bs&clubs=7kz;168&clubsuit=7kz;167&colon=1m;j&colone=6s4;s7&coloneq=6s4;s5&comma=18;g&commat=1s;u&comp=6pt;mv&compfn=6qg;o7&complement=6pt;mu&complexes=6iq;f6&cong=6rp;qz&congdot=8dp;1ef&conint=6r2;pj&copf=2kpg;1md&coprod=6q8;nr©=4p;1r©sr=6jb;fz&crarr=6np;k1&cross=7pz;16k&cscr=2kl4;1jw&csub=8gf;1id&csube=8gh;1if&csup=8gg;1ie&csupe=8gi;1ig&ctdot=6wf;11g&cudarrl=854;18x&cudarrr=851;18u&cuepr=6vy;10m&cuesc=6vz;10o&cularr=6nq;k3&cularrp=859;190&cup=6qy;pc&cupbrcap=8co;1du&cupcap=8cm;1ds&cupcup=8cq;1dw&cupdot=6tp;we&cupor=8cl;1dr&cups=6qy,1e68;pb&curarr=6nr;k5&curarrm=858;18z&curlyeqprec=6vy;10l&curlyeqsucc=6vz;10n&curlyvee=6vi;zr&curlywedge=6vj;zt¤=4k;1i&curvearrowleft=6nq;k2&curvearrowright=6nr;k4&cuvee=6vi;zs&cuwed=6vj;zu&cwconint=6r6;pq&cwint=6r5;po&cylcty=6y5;12u&dArr=6oj;m2&dHar=86d;19t&dagger=6cg;e5&daleth=6k8;h7&darr=6mr;ia&dash=6c0;dl&dashv=6ub;xr&dbkarow=83z;180&dblac=kd;8b&dcaron=7j;4x&dcy=tw;bv&dd=6km;hb&ddagger=6ch;e6&ddarr=6oa;ld&ddotseq=8dz;1ep°=4w;21&delta=qc;95&demptyv=88h;1be&dfisht=873;1aj&dfr=2ko1;1l5&dharl=6o3;kx&dharr=6o2;ku&diam=6v8;zc&diamond=6v8;zb&dia
mondsuit=7l2;16b&diams=7l2;16c&die=4o;1o&digamma=rh;a6&disin=6wi;11j&div=6v;49÷=6v;48÷ontimes=6vb;zg&divonx=6vb;zh&djcy=uq;co&dlcorn=6xq;12n&dlcrop=6x9;12a&dollar=10;6&dopf=2kph;1me&dot=k9;85&doteq=6s0;rx&doteqdot=6s1;rz&dotminus=6rc;q2&dotplus=6qc;ny&dotsquare=6u9;xm&doublebarwedge=6x2;11z&downarrow=6mr;i9&downdownarrows=6oa;lc&downharpoonleft=6o3;kv&downharpoonright=6o2;ks&drbkarow=840;182&drcorn=6xr;12p&drcrop=6x8;129&dscr=2kl5;1jx&dscy=ut;cr&dsol=8ae;1cc&dstrok=7l;4z&dtdot=6wh;11i&dtri=7gf;15j&dtrif=7ge;15h&duarr=6ph;mo&duhar=86n;1a5&dwangle=886;1b3&dzcy=v3;d0&dzigrarr=7wf;17r&eDDot=8dz;1eq&eDot=6s1;s0é=6h;3u&easter=8dq;1eg&ecaron=7v;57&ecir=6s6;sbê=6i;3v&ecolon=6s5;s9&ecy=ul;ck&edot=7r;53&ee=6kn;he&efDot=6s2;s2&efr=2ko2;1l6&eg=8ey;1g9è=6g;3t&egs=8eu;1g5&egsdot=8ew;1g7&el=8ex;1g8&elinters=73b;13e&ell=6j7;fv&els=8et;1g3&elsdot=8ev;1g6&emacr=7n;51&empty=6px;n7&emptyset=6px;n5&emptyv=6px;n6&emsp=6bn;d2&emsp13=6bo;d3&emsp14=6bp;d4&eng=97;6h&ensp=6bm;d1&eogon=7t;55&eopf=2kpi;1mf&epar=6vp;103&eparsl=89v;1c6&eplus=8dt;1ek&epsi=qd;97&epsilon=qd;96&epsiv=s5;ae&eqcirc=6s6;sa&eqcolon=6s5;s8&eqsim=6rm;qq&eqslantgtr=8eu;1g4&eqslantless=8et;1g2&equals=1p;p&equest=6sf;sj&equiv=6sh;so&equivDD=8e0;1er&eqvparsl=89x;1c8&erDot=6s3;s4&erarr=86p;1a7&escr=6jz;gs&esdot=6s0;ry&esim=6rm;qr&eta=qf;99ð=6o;41ë=6j;3w&euro=6gc;f2&excl=x;2&exist=6pv;n0&expectation=6k0;gt&exponentiale=6kn;hd&fallingdotseq=6s2;s1&fcy=uc;cb&female=7k0;163&ffilig=1dkz;1ja&fflig=1dkw;1j7&ffllig=1dl0;1jb&ffr=2ko3;1l7&filig=1dkx;1j8&fjlig=2u,2y;15&flat=7l9;16e&fllig=1dky;1j9&fltns=7g1;153&fnof=b6;7v&fopf=2kpj;1mg&forall=6ps;mt&fork=6vo;102&forkv=8gp;1in&fpartint=8b1;1cp½=59;2k&frac13=6kz;hh¼=58;2j&frac15=6l1;hj&frac16=6l5;hn&frac18=6l7;hp&frac23=6l0;hi&frac25=6l2;hk¾=5a;2m&frac35=6l3;hl&frac38=6l8;hq&frac45=6l4;hm&frac56=6l6;ho&frac58=6l9;hr&frac78=6la;hs&frasl=6dg;eq&frown=6xu;12r&fscr=2kl7;1jy&gE=6sn;t8&gEl=8ek;1ft&gacute=dx;7x&gamma=qb;94&gammad=rh;a7&gap=8ee;1fh&gbreve=7z;5b&gcirc=7x;59&gcy=tv;bu&gdot=81;5d&ge=6s
l;sx&gel=6vv;10k&geq=6sl;sw&geqq=6sn;t7&geqslant=8e6;1f6&ges=8e6;1f7&gescc=8fd;1gn&gesdot=8e8;1f9&gesdoto=8ea;1fb&gesdotol=8ec;1fd&gesl=6vv,1e68;10h&gesles=8es;1g1&gfr=2ko4;1l8&gg=6sr;ts&ggg=6vt;10b&gimel=6k7;h6&gjcy=ur;cp&gl=6t3;un&glE=8eq;1fz&gla=8f9;1gj&glj=8f8;1gi&gnE=6sp;tg&gnap=8ei;1fp&gnapprox=8ei;1fo&gne=8eg;1fl&gneq=8eg;1fk&gneqq=6sp;tf&gnsim=6w7;10y&gopf=2kpk;1mh&grave=2o;14&gscr=6iy;f9&gsim=6sz;ud&gsime=8em;1fv&gsiml=8eo;1fx>=1q;s>cc=8fb;1gl>cir=8e2;1et>dot=6vr;107>lPar=87p;1aw>quest=8e4;1ev>rapprox=8ee;1fg>rarr=86w;1ad>rdot=6vr;106>reqless=6vv;10j>reqqless=8ek;1fs>rless=6t3;um>rsim=6sz;uc&gvertneqq=6sp,1e68;td&gvnE=6sp,1e68;te&hArr=6ok;m5&hairsp=6bu;da&half=59;2l&hamilt=6iz;fb&hardcy=ui;ch&harr=6ms;id&harrcir=85k;192&harrw=6nh;js&hbar=6j3;fl&hcirc=85;5g&hearts=7l1;16a&heartsuit=7l1;169&hellip=6cm;eb&hercon=6ux;yr&hfr=2ko5;1l9&hksearow=84l;18i&hkswarow=84m;18k&hoarr=6pr;mr&homtht=6rf;q5&hookleftarrow=6nd;jj&hookrightarrow=6ne;jl&hopf=2kpl;1mi&horbar=6c5;do&hscr=2kl9;1jz&hslash=6j3;fi&hstrok=87;5i&hybull=6df;ep&hyphen=6c0;dkí=6l;3y&ic=6eb;f1î=6m;3z&icy=u0;bz&iecy=tx;bw¡=4h;1f&iff=6ok;m6&ifr=2ko6;1laì=6k;3x&ii=6ko;hg&iiiint=8b0;1cn&iiint=6r1;pg&iinfin=89o;1c3&iiota=6jt;gm&ijlig=8j;5t&imacr=8b;5m&image=6j5;fp&imagline=6j4;fm&imagpart=6j5;fo&imath=8h;5r&imof=6uv;yo&imped=c5;7w&in=6q0;nd&incare=6it;f8&infin=6qm;of&infintie=89p;1c4&inodot=8h;5q&int=6qz;pe&intcal=6uy;yt&integers=6jo;gh&intercal=6uy;ys&intlarhk=8bb;1cx&intprod=8cc;1dk&iocy=up;cn&iogon=8f;5o&iopf=2kpm;1mj&iota=qh;9b&iprod=8cc;1dl¿=5b;2n&iscr=2kla;1k0&isin=6q0;nc&isinE=6wp;11r&isindot=6wl;11n&isins=6wk;11l&isinsv=6wj;11k&isinv=6q0;nb&it=6ea;ez&itilde=89;5k&iukcy=uu;csï=6n;40&jcirc=8l;5v&jcy=u1;c0&jfr=2ko7;1lb&jmath=fr;7y&jopf=2kpn;1mk&jscr=2klb;1k1&jsercy=uw;cu&jukcy=us;cq&kappa=qi;9c&kappav=s0;a9&kcedil=8n;5x&kcy=u2;c1&kfr=2ko8;1lc&kgreen=8o;5y&khcy=ud;cc&kjcy=v0;cy&kopf=2kpo;1ml&kscr=2klc;1k2&lAarr=6oq;mf&lArr=6og;ls&lAtail=84b;18a&lBarr=83y;17z&lE=6sm;t2&lEg=8ej;1fr&lHar=86a;19q&lacute=8q;60&lae
mptyv=88k;1bh&lagran=6j6;ft&lambda=qj;9d&lang=7vs;16z&langd=87l;1as&langle=7vs;16y&lap=8ed;1ff«=4r;1t&larr=6mo;hx&larrb=6p0;mk&larrbfs=84f;18e&larrfs=84d;18c&larrhk=6nd;jk&larrlp=6nf;jo&larrpl=855;18y&larrsim=86r;1a9&larrtl=6n6;j7&lat=8ff;1gp&latail=849;188&late=8fh;1gt&lates=8fh,1e68;1gs&lbarr=83w;17w&lbbrk=7si;16p&lbrace=3f;16&lbrack=2j;v&lbrke=87f;1am&lbrksld=87j;1aq&lbrkslu=87h;1ao&lcaron=8u;64&lcedil=8s;62&lceil=6x4;122&lcub=3f;17&lcy=u3;c2&ldca=852;18v&ldquo=6cc;dz&ldquor=6ce;e3&ldrdhar=86f;19v&ldrushar=85n;195&ldsh=6nm;jz&le=6sk;st&leftarrow=6mo;hv&leftarrowtail=6n6;j6&leftharpoondown=6nx;kd&leftharpoonup=6nw;ka&leftleftarrows=6o7;l6&leftrightarrow=6ms;ic&leftrightarrows=6o6;l4&leftrightharpoons=6ob;lf&leftrightsquigarrow=6nh;jr&leftthreetimes=6vf;zl&leg=6vu;10g&leq=6sk;ss&leqq=6sm;t1&leqslant=8e5;1f0&les=8e5;1f1&lescc=8fc;1gm&lesdot=8e7;1f8&lesdoto=8e9;1fa&lesdotor=8eb;1fc&lesg=6vu,1e68;10d&lesges=8er;1g0&lessapprox=8ed;1fe&lessdot=6vq;104&lesseqgtr=6vu;10f&lesseqqgtr=8ej;1fq&lessgtr=6t2;uj&lesssim=6sy;u9&lfisht=870;1ag&lfloor=6x6;126&lfr=2ko9;1ld&lg=6t2;uk&lgE=8ep;1fy&lhard=6nx;kf&lharu=6nw;kc&lharul=86i;19y&lhblk=7es;14o&ljcy=ux;cv&ll=6sq;tm&llarr=6o7;l7&llcorner=6xq;12m&llhard=86j;19z&lltri=7i2;15w&lmidot=8w;66&lmoust=71s;131&lmoustache=71s;130&lnE=6so;tc&lnap=8eh;1fn&lnapprox=8eh;1fm&lne=8ef;1fj&lneq=8ef;1fi&lneqq=6so;tb&lnsim=6w6;10x&loang=7vw;175&loarr=6pp;mp&lobrk=7vq;16u&longleftarrow=7w5;178&longleftrightarrow=7w7;17e&longmapsto=7wc;17p&longrightarrow=7w6;17b&looparrowleft=6nf;jn&looparrowright=6ng;jp&lopar=879;1ak&lopf=2kpp;1mm&loplus=8bx;1d6&lotimes=8c4;1dc&lowast=6qf;o5&lowbar=2n;12&loz=7gq;15p&lozenge=7gq;15o&lozf=8a3;1ca&lpar=14;b&lparlt=87n;1au&lrarr=6o6;l5&lrcorner=6xr;12o&lrhar=6ob;lg&lrhard=86l;1a1&lrm=6by;di&lrtri=6v3;yx&lsaquo=6d5;ek&lscr=2kld;1k3&lsh=6nk;jw&lsim=6sy;ua&lsime=8el;1fu&lsimg=8en;1fw&lsqb=2j;w&lsquo=6c8;ds&lsquor=6ca;dw&lstrok=8y;68<=1o;n<cc=8fa;1gk<cir=8e1;1es<dot=6vq;105<hree=6vf;zm<imes=6vd;zj<larr=86u;1ac<quest=8e3;1eu<r
Par=87q;1ax<ri=7gj;15n<rie=6us;yi<rif=7gi;15l&lurdshar=85m;194&luruhar=86e;19u&lvertneqq=6so,1e68;t9&lvnE=6so,1e68;ta&mDDot=6re;q4¯=4v;20&male=7k2;164&malt=7q8;16m&maltese=7q8;16l&map=6na;jg&mapsto=6na;jf&mapstodown=6nb;ji&mapstoleft=6n8;jb&mapstoup=6n9;jd&marker=7fy;152&mcomma=8bt;1d4&mcy=u4;c3&mdash=6c4;dn&measuredangle=6qp;ok&mfr=2koa;1le&mho=6jr;gjµ=51;29&mid=6qr;oq&midast=16;d&midcir=8hc;1j1·=53;2d&minus=6qa;nu&minusb=6u7;xj&minusd=6rc;q3&minusdu=8bu;1d5&mlcp=8gr;1ip&mldr=6cm;ec&mnplus=6qb;nw&models=6uf;xy&mopf=2kpq;1mn&mp=6qb;nx&mscr=2kle;1k4&mstpos=6ri;qf&mu=qk;9e&multimap=6uw;yp&mumap=6uw;yq&nGg=6vt,mw;10a&nGt=6sr,6he;tp&nGtv=6sr,mw;to&nLeftarrow=6od;lk&nLeftrightarrow=6oe;lm&nLl=6vs,mw;108&nLt=6sq,6he;tj&nLtv=6sq,mw;ti&nRightarrow=6of;lo&nVDash=6un;y7&nVdash=6um;y6&nabla=6pz;n8&nacute=90;6a&nang=6qo,6he;oh&nap=6rt;rb&napE=8ds,mw;1ei&napid=6rv,mw;re&napos=95;6f&napprox=6rt;ra&natur=7la;16g&natural=7la;16f&naturals=6j9;fw =4g;1e&nbump=6ry,mw;rm&nbumpe=6rz,mw;rr&ncap=8cj;1dp&ncaron=94;6e&ncedil=92;6c&ncong=6rr;r2&ncongdot=8dp,mw;1ee&ncup=8ci;1do&ncy=u5;c4&ndash=6c3;dm&ne=6sg;sl&neArr=6on;mb&nearhk=84k;18h&nearr=6mv;im&nearrow=6mv;il&nedot=6s0,mw;rv&nequiv=6si;sq&nesear=84o;18n&nesim=6rm,mw;qo&nexist=6pw;n3&nexists=6pw;n2&nfr=2kob;1lf&ngE=6sn,mw;t4&nge=6sx;u7&ngeq=6sx;u6&ngeqq=6sn,mw;t5&ngeqslant=8e6,mw;1f3&nges=8e6,mw;1f4&ngsim=6t1;uh&ngt=6sv;u1&ngtr=6sv;u0&nhArr=6oe;ln&nharr=6ni;ju&nhpar=8he;1j3&ni=6q3;nk&nis=6ws;11u&nisd=6wq;11s&niv=6q3;nj&njcy=uy;cw&nlArr=6od;ll&nlE=6sm,mw;sy&nlarr=6my;iu&nldr=6cl;ea&nle=6sw;u4&nleftarrow=6my;it&nleftrightarrow=6ni;jt&nleq=6sw;u3&nleqq=6sm,mw;sz&nleqslant=8e5,mw;1ex&nles=8e5,mw;1ey&nless=6su;tx&nlsim=6t0;uf&nlt=6su;ty&nltri=6wa;115&nltrie=6wc;11b&nmid=6qs;ou&nopf=2kpr;1mo¬=4s;1u¬in=6q1;ng¬inE=6wp,mw;11q¬indot=6wl,mw;11m¬inva=6q1;nf¬invb=6wn;11p¬invc=6wm;11o¬ni=6q4;nn¬niva=6q4;nm¬nivb=6wu;11w¬nivc=6wt;11v&npar=6qu;p4&nparallel=6qu;p2&nparsl=8hp,6hx;1j5&npart=6pu,mw;mw&npolint=8b8;1cu&npr=6tc;vd&nprcue=6w0;10q&npre=8fj,mw;1gw
&nprec=6tc;vc&npreceq=8fj,mw;1gx&nrArr=6of;lp&nrarr=6mz;iw&nrarrc=84z,mw;18s&nrarrw=6n1,mw;ix&nrightarrow=6mz;iv&nrtri=6wb;118&nrtrie=6wd;11e&nsc=6td;vg&nsccue=6w1;10s&nsce=8fk,mw;1h2&nscr=2klf;1k5&nshortmid=6qs;os&nshortparallel=6qu;p1&nsim=6rl;qm&nsime=6ro;qx&nsimeq=6ro;qw&nsmid=6qs;ot&nspar=6qu;p3&nsqsube=6w2;10u&nsqsupe=6w3;10w&nsub=6tg;vs&nsubE=8g5,mw;1hv&nsube=6tk;w2&nsubset=6te,6he;vi&nsubseteq=6tk;w1&nsubseteqq=8g5,mw;1hw&nsucc=6td;vf&nsucceq=8fk,mw;1h3&nsup=6th;vt&nsupE=8g6,mw;1hz&nsupe=6tl;w5&nsupset=6tf,6he;vn&nsupseteq=6tl;w4&nsupseteqq=8g6,mw;1i0&ntgl=6t5;urñ=6p;42&ntlg=6t4;up&ntriangleleft=6wa;114&ntrianglelefteq=6wc;11a&ntriangleright=6wb;117&ntrianglerighteq=6wd;11d&nu=ql;9f&num=z;5&numero=6ja;fy&numsp=6br;d5&nvDash=6ul;y5&nvHarr=83o;17u&nvap=6rx,6he;ri&nvdash=6uk;y4&nvge=6sl,6he;su&nvgt=1q,6he;q&nvinfin=89q;1c5&nvlArr=83m;17s&nvle=6sk,6he;sr&nvlt=1o,6he;l&nvltrie=6us,6he;yf&nvrArr=83n;17t&nvrtrie=6ut,6he;yj&nvsim=6rg,6he;q6&nwArr=6om;ma&nwarhk=84j;18g&nwarr=6mu;ij&nwarrow=6mu;ii&nwnear=84n;18m&oS=79k;13hó=6r;44&oast=6u3;xd&ocir=6u2;xbô=6s;45&ocy=u6;c5&odash=6u5;xf&odblac=9d;6l&odiv=8c8;1dg&odot=6u1;x9&odsold=88s;1bn&oelig=9f;6n&ofcir=88v;1bp&ofr=2koc;1lg&ogon=kb;87ò=6q;43&ogt=88x;1br&ohbar=88l;1bi&ohm=q1;91&oint=6r2;pk&olarr=6nu;k7&olcir=88u;1bo&olcross=88r;1bm&oline=6da;en&olt=88w;1bq&omacr=99;6j&omega=qx;9u&omicron=qn;9h&omid=88m;1bj&ominus=6ty;x4&oopf=2kps;1mp&opar=88n;1bk&operp=88p;1bl&oplus=6tx;x2&or=6qw;p8&orarr=6nv;k9&ord=8d9;1ea&order=6k4;h1&orderof=6k4;h0ª=4q;1sº=56;2h&origof=6uu;yn&oror=8d2;1e4&orslope=8d3;1e5&orv=8d7;1e8&oscr=6k4;h2ø=6w;4a&osol=6u0;x7õ=6t;46&otimes=6tz;x6&otimesas=8c6;1deö=6u;47&ovbar=6yl;12x&par=6qt;oz¶=52;2a¶llel=6qt;ox&parsim=8hf;1j4&parsl=8hp;1j6&part=6pu;my&pcy=u7;c6&percnt=11;7&period=1a;h&permil=6cw;ed&perp=6ud;xw&pertenk=6cx;ee&pfr=2kod;1lh&phi=qu;9r&phiv=r9;a2&phmmat=6k3;gy&phone=7im;162&pi=qo;9i&pitchfork=6vo;101&piv=ra;a4&planck=6j3;fj&planckh=6j2;fh&plankv=6j3;fk&plus=17;f&plusacir=8bn;1cz&plusb=6u6;xh&pluscir
=8bm;1cy&plusdo=6qc;nz&plusdu=8bp;1d1&pluse=8du;1el±=4x;23&plussim=8bq;1d2&plustwo=8br;1d3&pm=4x;24&pointint=8b9;1cv&popf=2kpt;1mq£=4j;1h&pr=6t6;uu&prE=8fn;1h7&prap=8fr;1he&prcue=6t8;v0&pre=8fj;1h0&prec=6t6;ut&precapprox=8fr;1hd&preccurlyeq=6t8;uz&preceq=8fj;1gz&precnapprox=8ft;1hh&precneqq=8fp;1h9&precnsim=6w8;10z&precsim=6ta;v5&prime=6cy;ef&primes=6jd;g2&prnE=8fp;1ha&prnap=8ft;1hi&prnsim=6w8;110&prod=6q7;np&profalar=6y6;12v&profline=6xe;12e&profsurf=6xf;12f&prop=6ql;oe&propto=6ql;oc&prsim=6ta;v6&prurel=6uo;y8&pscr=2klh;1k6&psi=qw;9t&puncsp=6bs;d6&qfr=2koe;1li&qint=8b0;1co&qopf=2kpu;1mr&qprime=6dz;es&qscr=2kli;1k7&quaternions=6j1;ff&quatint=8ba;1cw&quest=1r;t&questeq=6sf;si"=y;4&rAarr=6or;mh&rArr=6oi;lz&rAtail=84c;18b&rBarr=83z;181&rHar=86c;19s&race=6rh,mp;qb&racute=9h;6p&radic=6qi;o8&raemptyv=88j;1bg&rang=7vt;172&rangd=87m;1at&range=885;1b2&rangle=7vt;171»=57;2i&rarr=6mq;i6&rarrap=86t;1ab&rarrb=6p1;mm&rarrbfs=84g;18f&rarrc=84z;18t&rarrfs=84e;18d&rarrhk=6ne;jm&rarrlp=6ng;jq&rarrpl=85h;191&rarrsim=86s;1aa&rarrtl=6n7;j9&rarrw=6n1;iz&ratail=84a;189&ratio=6ra;pz&rationals=6je;g4&rbarr=83x;17y&rbbrk=7sj;16q&rbrace=3h;1b&rbrack=2l;y&rbrke=87g;1an&rbrksld=87i;1ap&rbrkslu=87k;1ar&rcaron=9l;6t&rcedil=9j;6r&rceil=6x5;124&rcub=3h;1c&rcy=u8;c7&rdca=853;18w&rdldhar=86h;19x&rdquo=6cd;e2&rdquor=6cd;e1&rdsh=6nn;k0&real=6jg;g9&realine=6jf;g6&realpart=6jg;g8&reals=6jh;gc&rect=7fx;151®=4u;1y&rfisht=871;1ah&rfloor=6x7;128&rfr=2kof;1lj&rhard=6o1;kr&rharu=6o0;ko&rharul=86k;1a0&rho=qp;9j&rhov=s1;ab&rightarrow=6mq;i4&rightarrowtail=6n7;j8&rightharpoondown=6o1;kp&rightharpoonup=6o0;km&rightleftarrows=6o4;kz&rightleftharpoons=6oc;lh&rightrightarrows=6o9;la&rightsquigarrow=6n1;iy&rightthreetimes=6vg;zn&ring=ka;86&risingdotseq=6s3;s3&rlarr=6o4;l0&rlhar=6oc;lj&rlm=6bz;dj&rmoust=71t;133&rmoustache=71t;132&rnmid=8ha;1iz&roang=7vx;176&roarr=6pq;mq&robrk=7vr;16w&ropar=87a;1al&ropf=2kpv;1ms&roplus=8by;1d7&rotimes=8c5;1dd&rpar=15;c&rpargt=87o;1av&rppolint=8b6;1cs&rrarr=6o9;lb&rsaquo=6d6;el&rscr=2klj
;1k8&rsh=6nl;jy&rsqb=2l;z&rsquo=6c9;dv&rsquor=6c9;du&rthree=6vg;zo&rtimes=6ve;zk&rtri=7g9;15d&rtrie=6ut;ym&rtrif=7g8;15b&rtriltri=89a;1by&ruluhar=86g;19w&rx=6ji;ge&sacute=9n;6v&sbquo=6ca;dx&sc=6t7;ux&scE=8fo;1h8&scap=8fs;1hg&scaron=9t;71&sccue=6t9;v3&sce=8fk;1h6&scedil=9r;6z&scirc=9p;6x&scnE=8fq;1hc&scnap=8fu;1hk&scnsim=6w9;112&scpolint=8b7;1ct&scsim=6tb;va&scy=u9;c8&sdot=6v9;zd&sdotb=6u9;xn&sdote=8di;1ec&seArr=6oo;mc&searhk=84l;18j&searr=6mw;ip&searrow=6mw;io§=4n;1l&semi=1n;k&seswar=84p;18p&setminus=6qe;o2&setmn=6qe;o4&sext=7qu;16n&sfr=2kog;1lk&sfrown=6xu;12q&sharp=7lb;16h&shchcy=uh;cg&shcy=ug;cf&shortmid=6qr;oo&shortparallel=6qt;ow­=4t;1v&sigma=qr;9n&sigmaf=qq;9l&sigmav=qq;9m&sim=6rg;qa&simdot=8dm;1ed&sime=6rn;qu&simeq=6rn;qt&simg=8f2;1gb&simgE=8f4;1gd&siml=8f1;1ga&simlE=8f3;1gc&simne=6rq;r0&simplus=8bo;1d0&simrarr=86q;1a8&slarr=6mo;hw&smallsetminus=6qe;o0&smashp=8c3;1db&smeparsl=89w;1c7&smid=6qr;op&smile=6xv;12t&smt=8fe;1go&smte=8fg;1gr&smtes=8fg,1e68;1gq&softcy=uk;cj&sol=1b;i&solb=890;1bu&solbar=6yn;12y&sopf=2kpw;1mt&spades=7kw;166&spadesuit=7kw;165&spar=6qt;oy&sqcap=6tv;wx&sqcaps=6tv,1e68;wv&sqcup=6tw;x0&sqcups=6tw,1e68;wy&sqsub=6tr;wk&sqsube=6tt;wr&sqsubset=6tr;wj&sqsubseteq=6tt;wq&sqsup=6ts;wo&sqsupe=6tu;wu&sqsupset=6ts;wn&sqsupseteq=6tu;wt&squ=7fl;14v&square=7fl;14u&squarf=7fu;14y&squf=7fu;14z&srarr=6mq;i5&sscr=2klk;1k9&ssetmn=6qe;o3&ssmile=6xv;12s&sstarf=6va;ze&star=7ie;161&starf=7id;160&straightepsilon=s5;ac&straightphi=r9;a0&strns=4v;1z&sub=6te;vl&subE=8g5;1hy&subdot=8fx;1hn&sube=6ti;vw&subedot=8g3;1ht&submult=8g1;1hr&subnE=8gb;1i8&subne=6tm;w9&subplus=8fz;1hp&subrarr=86x;1ae&subset=6te;vk&subseteq=6ti;vv&subseteqq=8g5;1hx&subsetneq=6tm;w8&subsetneqq=8gb;1i7&subsim=8g7;1i3&subsub=8gl;1ij&subsup=8gj;1ih&succ=6t7;uw&succapprox=8fs;1hf&succcurlyeq=6t9;v2&succeq=8fk;1h5&succnapprox=8fu;1hj&succneqq=8fq;1hb&succnsim=6w9;111&succsim=6tb;v9&sum=6q9;nt&sung=7l6;16d&sup=6tf;vr¹=55;2g²=4y;25³=4z;26&supE=8g6;1i2&supdot=8fy;1ho&supdsub=8go;1im&supe=6tj;vz&supedot=8g4
;1hu&suphsol=7ux;16s&suphsub=8gn;1il&suplarr=86z;1af&supmult=8g2;1hs&supnE=8gc;1ic&supne=6tn;wd&supplus=8g0;1hq&supset=6tf;vq&supseteq=6tj;vy&supseteqq=8g6;1i1&supsetneq=6tn;wc&supsetneqq=8gc;1ib&supsim=8g8;1i4&supsub=8gk;1ii&supsup=8gm;1ik&swArr=6op;md&swarhk=84m;18l&swarr=6mx;is&swarrow=6mx;ir&swnwar=84q;18rß=67;3k&target=6xi;12h&tau=qs;9o&tbrk=71w;135&tcaron=9x;75&tcedil=9v;73&tcy=ua;c9&tdot=6hn;f4&telrec=6xh;12g&tfr=2koh;1ll&there4=6r8;pv&therefore=6r8;pu&theta=qg;9a&thetasym=r5;9v&thetav=r5;9x&thickapprox=6rs;r3&thicksim=6rg;q7&thinsp=6bt;d8&thkap=6rs;r7&thksim=6rg;q8þ=72;4g&tilde=kc;89×=5z;3c×b=6u8;xl×bar=8c1;1da×d=8c0;1d9&tint=6r1;ph&toea=84o;18o&top=6uc;xt&topbot=6ye;12w&topcir=8hd;1j2&topf=2kpx;1mu&topfork=8gq;1io&tosa=84p;18q&tprime=6d0;eh&trade=6jm;gg&triangle=7g5;158&triangledown=7gf;15i&triangleleft=7gj;15m&trianglelefteq=6us;yh&triangleq=6sc;sg&triangleright=7g9;15c&trianglerighteq=6ut;yl&tridot=7ho;15r&trie=6sc;sh&triminus=8ca;1di&triplus=8c9;1dh&trisb=899;1bx&tritime=8cb;1dj&trpezium=736;13d&tscr=2kll;1ka&tscy=ue;cd&tshcy=uz;cx&tstrok=9z;77&twixt=6ss;tu&twoheadleftarrow=6n2;j0&twoheadrightarrow=6n4;j3&uArr=6oh;lv&uHar=86b;19rú=6y;4c&uarr=6mp;i1&ubrcy=v2;cz&ubreve=a5;7dû=6z;4d&ucy=ub;ca&udarr=6o5;l2&udblac=a9;7h&udhar=86m;1a3&ufisht=872;1ai&ufr=2koi;1lmù=6x;4b&uharl=6nz;kl&uharr=6ny;ki&uhblk=7eo;14n&ulcorn=6xo;12j&ulcorner=6xo;12i&ulcrop=6xb;12c&ultri=7i0;15u&umacr=a3;7b¨=4o;1p&uogon=ab;7j&uopf=2kpy;1mv&uparrow=6mp;i0&updownarrow=6mt;if&upharpoonleft=6nz;kj&upharpoonright=6ny;kg&uplus=6tq;wg&upsi=qt;9q&upsih=r6;9y&upsilon=qt;9p&upuparrows=6o8;l8&urcorn=6xp;12l&urcorner=6xp;12k&urcrop=6xa;12b&uring=a7;7f&urtri=7i1;15v&uscr=2klm;1kb&utdot=6wg;11h&utilde=a1;79&utri=7g5;159&utrif=7g4;157&uuarr=6o8;l9ü=70;4e&uwangle=887;1b4&vArr=6ol;m9&vBar=8h4;1iu&vBarv=8h5;1iv&vDash=6ug;y0&vangrt=87w;1az&varepsilon=s5;ad&varkappa=s0;a8&varnothing=6px;n4&varphi=r9;a1&varpi=ra;a3&varpropto=6ql;ob&varr=6mt;ig&varrho=s1;aa&varsigma=qq;9k&varsubsetneq=6tm,1e68;w6&varsubsetneq
q=8gb,1e68;1i5&varsupsetneq=6tn,1e68;wa&varsupsetneqq=8gc,1e68;1i9&vartheta=r5;9w&vartriangleleft=6uq;y9&vartriangleright=6ur;yc&vcy=tu;bt&vdash=6ua;xp&vee=6qw;p7&veebar=6uz;yu&veeeq=6sa;sf&vellip=6we;11f&verbar=3g;19&vert=3g;1a&vfr=2koj;1ln&vltri=6uq;yb&vnsub=6te,6he;vj&vnsup=6tf,6he;vo&vopf=2kpz;1mw&vprop=6ql;od&vrtri=6ur;ye&vscr=2kln;1kc&vsubnE=8gb,1e68;1i6&vsubne=6tm,1e68;w7&vsupnE=8gc,1e68;1ia&vsupne=6tn,1e68;wb&vzigzag=87u;1ay&wcirc=ad;7l&wedbar=8db;1eb&wedge=6qv;p5&wedgeq=6s9;se&weierp=6jc;g0&wfr=2kok;1lo&wopf=2kq0;1mx&wp=6jc;g1&wr=6rk;qk&wreath=6rk;qj&wscr=2klo;1kd&xcap=6v6;z6&xcirc=7hr;15t&xcup=6v7;z9&xdtri=7gd;15f&xfr=2kol;1lp&xhArr=7wa;17o&xharr=7w7;17f&xi=qm;9g&xlArr=7w8;17i&xlarr=7w5;179&xmap=7wc;17q&xnis=6wr;11t&xodot=8ao;1ce&xopf=2kq1;1my&xoplus=8ap;1cg&xotime=8aq;1ci&xrArr=7w9;17l&xrarr=7w6;17c&xscr=2klp;1ke&xsqcup=8au;1cm&xuplus=8as;1ck&xutri=7g3;155&xvee=6v5;z2&xwedge=6v4;yzý=71;4f&yacy=un;cm&ycirc=af;7n&ycy=uj;ci¥=4l;1j&yfr=2kom;1lq&yicy=uv;ct&yopf=2kq2;1mz&yscr=2klq;1kf&yucy=um;clÿ=73;4h&zacute=ai;7q&zcaron=am;7u&zcy=tz;by&zdot=ak;7s&zeetrf=6js;gk&zeta=qe;98&zfr=2kon;1lr&zhcy=ty;bx&zigrarr=6ot;mi&zopf=2kq3;1n0&zscr=2klr;1kg&zwj=6bx;dh&zwnj=6bw;dg&"; +} + diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/CharacterReader.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/CharacterReader.java new file mode 100644 index 00000000..c29b4454 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/CharacterReader.java @@ -0,0 +1,483 @@ +package ru.noties.markwon.html.jsoup.parser; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.Arrays; +import java.util.Locale; + +import ru.noties.markwon.html.jsoup.UncheckedIOException; +import ru.noties.markwon.html.jsoup.helper.Validate; + +/** + CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes. 
+ */ +public final class CharacterReader { + static final char EOF = (char) -1; + private static final int maxStringCacheLen = 12; + static final int maxBufferLen = 1024 * 32; // visible for testing + private static final int readAheadLimit = (int) (maxBufferLen * 0.75); + + private final char[] charBuf; + private final Reader reader; + private int bufLength; + private int bufSplitPoint; + private int bufPos; + private int readerPos; + private int bufMark; + private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage + + public CharacterReader(Reader input, int sz) { + Validate.notNull(input); + Validate.isTrue(input.markSupported()); + reader = input; + charBuf = new char[sz > maxBufferLen ? maxBufferLen : sz]; + bufferUp(); + } + + public CharacterReader(Reader input) { + this(input, maxBufferLen); + } + + public CharacterReader(String input) { + this(new StringReader(input), input.length()); + } + + private void bufferUp() { + if (bufPos < bufSplitPoint) + return; + + try { + reader.skip(bufPos); + reader.mark(maxBufferLen); + final int read = reader.read(charBuf); + reader.reset(); + if (read != -1) { + bufLength = read; + readerPos += bufPos; + bufPos = 0; + bufMark = 0; + bufSplitPoint = bufLength > readAheadLimit ? readAheadLimit : bufLength; + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Gets the current cursor position in the content. + * @return current position + */ + public int pos() { + return readerPos + bufPos; + } + + /** + * Tests if all the content has been read. + * @return true if nothing left to read. + */ + public boolean isEmpty() { + bufferUp(); + return bufPos >= bufLength; + } + + private boolean isEmptyNoBufferUp() { + return bufPos >= bufLength; + } + + /** + * Get the char at the current position. + * @return char + */ + public char current() { + bufferUp(); + return isEmptyNoBufferUp() ? 
EOF : charBuf[bufPos]; + } + + char consume() { + bufferUp(); + char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos]; + bufPos++; + return val; + } + + void unconsume() { + bufPos--; + } + + /** + * Moves the current position by one. + */ + public void advance() { + bufPos++; + } + + void mark() { + bufMark = bufPos; + } + + void rewindToMark() { + bufPos = bufMark; + } + + /** + * Returns the number of characters between the current position and the next instance of the input char + * @param c scan target + * @return offset between current position and next instance of target. -1 if not found. + */ + int nextIndexOf(char c) { + // doesn't handle scanning for surrogates + bufferUp(); + for (int i = bufPos; i < bufLength; i++) { + if (c == charBuf[i]) + return i - bufPos; + } + return -1; + } + + /** + * Returns the number of characters between the current position and the next instance of the input sequence + * + * @param seq scan target + * @return offset between current position and next instance of target. -1 if not found. + */ + int nextIndexOf(CharSequence seq) { + bufferUp(); + // doesn't handle scanning for surrogates + char startChar = seq.charAt(0); + for (int offset = bufPos; offset < bufLength; offset++) { + // scan to first instance of startchar: + if (startChar != charBuf[offset]) + while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ } + int i = offset + 1; + int last = i + seq.length()-1; + if (offset < bufLength && last <= bufLength) { + for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ } + if (i == last) // found full sequence + return offset - bufPos; + } + } + return -1; + } + + /** + * Reads characters up to the specific char. 
+ * @param c the delimiter + * @return the chars read + */ + public String consumeTo(char c) { + int offset = nextIndexOf(c); + if (offset != -1) { + String consumed = cacheString(charBuf, stringCache, bufPos, offset); + bufPos += offset; + return consumed; + } else { + return consumeToEnd(); + } + } + + String consumeTo(String seq) { + int offset = nextIndexOf(seq); + if (offset != -1) { + String consumed = cacheString(charBuf, stringCache, bufPos, offset); + bufPos += offset; + return consumed; + } else { + return consumeToEnd(); + } + } + + /** + * Read characters until the first of any delimiters is found. + * @param chars delimiters to scan for + * @return characters read up to the matched delimiter. + */ + public String consumeToAny(final char... chars) { + bufferUp(); + final int start = bufPos; + final int remaining = bufLength; + final char[] val = charBuf; + + OUTER: while (bufPos < remaining) { + for (char c : chars) { + if (val[bufPos] == c) + break OUTER; + } + bufPos++; + } + + return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; + } + + String consumeToAnySorted(final char... chars) { + bufferUp(); + final int start = bufPos; + final int remaining = bufLength; + final char[] val = charBuf; + + while (bufPos < remaining) { + if (Arrays.binarySearch(chars, val[bufPos]) >= 0) + break; + bufPos++; + } + + return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; + } + + String consumeData() { + // &, <, null + bufferUp(); + final int start = bufPos; + final int remaining = bufLength; + final char[] val = charBuf; + + while (bufPos < remaining) { + final char c = val[bufPos]; + if (c == '&'|| c == '<' || c == TokeniserState.nullChar) + break; + bufPos++; + } + + return bufPos > start ? 
cacheString(charBuf, stringCache, start, bufPos -start) : ""; + } + + String consumeTagName() { + // '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar + bufferUp(); + final int start = bufPos; + final int remaining = bufLength; + final char[] val = charBuf; + + while (bufPos < remaining) { + final char c = val[bufPos]; + if (c == '\t'|| c == '\n'|| c == '\r'|| c == '\f'|| c == ' '|| c == '/'|| c == '>'|| c == TokeniserState.nullChar) + break; + bufPos++; + } + + return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; + } + + String consumeToEnd() { + bufferUp(); + String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos); + bufPos = bufLength; + return data; + } + + String consumeLetterSequence() { + bufferUp(); + int start = bufPos; + while (bufPos < bufLength) { + char c = charBuf[bufPos]; + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c)) + bufPos++; + else + break; + } + + return cacheString(charBuf, stringCache, start, bufPos - start); + } + + String consumeLetterThenDigitSequence() { + bufferUp(); + int start = bufPos; + while (bufPos < bufLength) { + char c = charBuf[bufPos]; + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c)) + bufPos++; + else + break; + } + while (!isEmptyNoBufferUp()) { + char c = charBuf[bufPos]; + if (c >= '0' && c <= '9') + bufPos++; + else + break; + } + + return cacheString(charBuf, stringCache, start, bufPos - start); + } + + String consumeHexSequence() { + bufferUp(); + int start = bufPos; + while (bufPos < bufLength) { + char c = charBuf[bufPos]; + if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) + bufPos++; + else + break; + } + return cacheString(charBuf, stringCache, start, bufPos - start); + } + + String consumeDigitSequence() { + bufferUp(); + int start = bufPos; + while (bufPos < bufLength) { + char c = charBuf[bufPos]; + if (c >= '0' && c <= '9') + bufPos++; + else + break; + } + return 
cacheString(charBuf, stringCache, start, bufPos - start); + } + + boolean matches(char c) { + return !isEmpty() && charBuf[bufPos] == c; + + } + + boolean matches(String seq) { + bufferUp(); + int scanLength = seq.length(); + if (scanLength > bufLength - bufPos) + return false; + + for (int offset = 0; offset < scanLength; offset++) + if (seq.charAt(offset) != charBuf[bufPos +offset]) + return false; + return true; + } + + boolean matchesIgnoreCase(String seq) { + bufferUp(); + int scanLength = seq.length(); + if (scanLength > bufLength - bufPos) + return false; + + for (int offset = 0; offset < scanLength; offset++) { + char upScan = Character.toUpperCase(seq.charAt(offset)); + char upTarget = Character.toUpperCase(charBuf[bufPos + offset]); + if (upScan != upTarget) + return false; + } + return true; + } + + boolean matchesAny(char... seq) { + if (isEmpty()) + return false; + + bufferUp(); + char c = charBuf[bufPos]; + for (char seek : seq) { + if (seek == c) + return true; + } + return false; + } + + boolean matchesAnySorted(char[] seq) { + bufferUp(); + return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0; + } + + boolean matchesLetter() { + if (isEmpty()) + return false; + char c = charBuf[bufPos]; + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c); + } + + boolean matchesDigit() { + if (isEmpty()) + return false; + char c = charBuf[bufPos]; + return (c >= '0' && c <= '9'); + } + + boolean matchConsume(String seq) { + bufferUp(); + if (matches(seq)) { + bufPos += seq.length(); + return true; + } else { + return false; + } + } + + boolean matchConsumeIgnoreCase(String seq) { + if (matchesIgnoreCase(seq)) { + bufPos += seq.length(); + return true; + } else { + return false; + } + } + + boolean containsIgnoreCase(String seq) { + // used to check presence of , . only finds consistent case. 
+ String loScan = seq.toLowerCase(Locale.ENGLISH); + String hiScan = seq.toUpperCase(Locale.ENGLISH); + return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1); + } + + @Override + public String toString() { + return new String(charBuf, bufPos, bufLength - bufPos); + } + + /** + * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks. + *

+ * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list. + * That saves both having to create objects as hash keys, and running through the entry list, at the expense of + * some more duplicates. + */ + private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) { + // limit (no cache): + if (count > maxStringCacheLen) + return new String(charBuf, start, count); + if (count < 1) + return ""; + + // calculate hash: + int hash = 0; + int offset = start; + for (int i = 0; i < count; i++) { + hash = 31 * hash + charBuf[offset++]; + } + + // get from cache + final int index = hash & stringCache.length - 1; + String cached = stringCache[index]; + + if (cached == null) { // miss, add + cached = new String(charBuf, start, count); + stringCache[index] = cached; + } else { // hashcode hit, check equality + if (rangeEquals(charBuf, start, count, cached)) { // hit + return cached; + } else { // hashcode conflict + cached = new String(charBuf, start, count); + stringCache[index] = cached; // update the cache, as recently used strings are more likely to show up again + } + } + return cached; + } + + /** + * Check if the value of the provided range equals the string. 
+ */ + static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) { + if (count == cached.length()) { + int i = start; + int j = 0; + while (count-- != 0) { + if (charBuf[i++] != cached.charAt(j++)) + return false; + } + return true; + } + return false; + } + + // just used for testing + boolean rangeEquals(final int start, final int count, final String cached) { + return rangeEquals(charBuf, start, count, cached); + } +} diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseError.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseError.java new file mode 100644 index 00000000..533f9aee --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseError.java @@ -0,0 +1,41 @@ +package ru.noties.markwon.html.jsoup.parser; + +/** + * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase. + */ +public class ParseError { + private int pos; + private String errorMsg; + + ParseError(int pos, String errorMsg) { + this.pos = pos; + this.errorMsg = errorMsg; + } + + ParseError(int pos, String errorFormat, Object... args) { + this.errorMsg = String.format(errorFormat, args); + this.pos = pos; + } + + /** + * Retrieve the error message. + * @return the error message. + */ + public String getErrorMessage() { + return errorMsg; + } + + /** + * Retrieves the offset of the error. 
+ * @return error offset within input + */ + public int getPosition() { + return pos; + } + + @Override + public String toString() { + return pos + ": " + errorMsg; + } +} + diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseErrorList.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseErrorList.java new file mode 100644 index 00000000..a3e42a08 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/ParseErrorList.java @@ -0,0 +1,34 @@ +package ru.noties.markwon.html.jsoup.parser; + +import java.util.ArrayList; + +/** + * A container for ParseErrors. + * + * @author Jonathan Hedley + */ +public class ParseErrorList extends ArrayList{ + private static final int INITIAL_CAPACITY = 16; + private final int maxSize; + + ParseErrorList(int initialCapacity, int maxSize) { + super(initialCapacity); + this.maxSize = maxSize; + } + + boolean canAddError() { + return size() < maxSize; + } + + int getMaxSize() { + return maxSize; + } + + public static ParseErrorList noTracking() { + return new ParseErrorList(0, 0); + } + + public static ParseErrorList tracking(int maxSize) { + return new ParseErrorList(INITIAL_CAPACITY, maxSize); + } +} diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Token.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Token.java new file mode 100644 index 00000000..0b157d07 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Token.java @@ -0,0 +1,398 @@ +package ru.noties.markwon.html.jsoup.parser; + +import android.support.annotation.NonNull; + +import ru.noties.markwon.html.jsoup.helper.Validate; +import ru.noties.markwon.html.jsoup.nodes.Attributes; + +import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase; + +/** + * Parse tokens for the Tokeniser. 
+ */ +public abstract class Token { + + public final TokenType type; + + protected Token(@NonNull TokenType tokenType) { + this.type = tokenType; + } + +// String tokenType() { +// return this.getClass().getSimpleName(); +// } + + /** + * Reset the data represent by this token, for reuse. Prevents the need to create transfer objects for every + * piece of data, which immediately get GCed. + */ + public abstract Token reset(); + + static void reset(StringBuilder sb) { + if (sb != null) { + sb.delete(0, sb.length()); + } + } + + public static final class Doctype extends Token { + final StringBuilder name = new StringBuilder(); + String pubSysKey = null; + final StringBuilder publicIdentifier = new StringBuilder(); + final StringBuilder systemIdentifier = new StringBuilder(); + boolean forceQuirks = false; + + Doctype() { + super(TokenType.Doctype); + } + + @Override + public Token reset() { + reset(name); + pubSysKey = null; + reset(publicIdentifier); + reset(systemIdentifier); + forceQuirks = false; + return this; + } + + String getName() { + return name.toString(); + } + + String getPubSysKey() { + return pubSysKey; + } + + String getPublicIdentifier() { + return publicIdentifier.toString(); + } + + public String getSystemIdentifier() { + return systemIdentifier.toString(); + } + + public boolean isForceQuirks() { + return forceQuirks; + } + } + + public static abstract class Tag extends Token { + + public String tagName; + public String normalName; // lc version of tag name, for case insensitive tree build + private String pendingAttributeName; // attribute names are generally caught in one hop, not accumulated + private StringBuilder pendingAttributeValue = new StringBuilder(); // but values are accumulated, from e.g. 
& in hrefs + private String pendingAttributeValueS; // try to get attr vals in one shot, vs Builder + private boolean hasEmptyAttributeValue = false; // distinguish boolean attribute from empty string value + private boolean hasPendingAttributeValue = false; + public boolean selfClosing = false; + public Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used). + + protected Tag(@NonNull TokenType tokenType) { + super(tokenType); + } + + @Override + public Tag reset() { + tagName = null; + normalName = null; + pendingAttributeName = null; + reset(pendingAttributeValue); + pendingAttributeValueS = null; + hasEmptyAttributeValue = false; + hasPendingAttributeValue = false; + selfClosing = false; + attributes = null; + return this; + } + + final void newAttribute() { + if (attributes == null) + attributes = new Attributes(); + + if (pendingAttributeName != null) { + // the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here + pendingAttributeName = pendingAttributeName.trim(); + if (pendingAttributeName.length() > 0) { + String value; + if (hasPendingAttributeValue) + value = pendingAttributeValue.length() > 0 ? 
pendingAttributeValue.toString() : pendingAttributeValueS; + else if (hasEmptyAttributeValue) + value = ""; + else + value = null; + attributes.put(pendingAttributeName, value); + } + } + pendingAttributeName = null; + hasEmptyAttributeValue = false; + hasPendingAttributeValue = false; + reset(pendingAttributeValue); + pendingAttributeValueS = null; + } + + final void finaliseTag() { + // finalises for emit + if (pendingAttributeName != null) { + // todo: check if attribute name exists; if so, drop and error + newAttribute(); + } + } + + final String name() { // preserves case, for input into Tag.valueOf (which may drop case) + Validate.isFalse(tagName == null || tagName.length() == 0); + return tagName; + } + + final String normalName() { // loses case, used in tree building for working out where in tree it should go + return normalName; + } + + final Tag name(String name) { + tagName = name; + normalName = lowerCase(name); + return this; + } + + final boolean isSelfClosing() { + return selfClosing; + } + + @SuppressWarnings({"TypeMayBeWeakened"}) + final Attributes getAttributes() { + return attributes; + } + + // these appenders are rarely hit in not null state-- caused by null chars. + final void appendTagName(String append) { + tagName = tagName == null ? append : tagName.concat(append); + normalName = lowerCase(tagName); + } + + final void appendTagName(char append) { + appendTagName(String.valueOf(append)); + } + + final void appendAttributeName(String append) { + pendingAttributeName = pendingAttributeName == null ? 
append : pendingAttributeName.concat(append); + } + + final void appendAttributeName(char append) { + appendAttributeName(String.valueOf(append)); + } + + final void appendAttributeValue(String append) { + ensureAttributeValue(); + if (pendingAttributeValue.length() == 0) { + pendingAttributeValueS = append; + } else { + pendingAttributeValue.append(append); + } + } + + final void appendAttributeValue(char append) { + ensureAttributeValue(); + pendingAttributeValue.append(append); + } + + final void appendAttributeValue(char[] append) { + ensureAttributeValue(); + pendingAttributeValue.append(append); + } + + final void appendAttributeValue(int[] appendCodepoints) { + ensureAttributeValue(); + for (int codepoint : appendCodepoints) { + pendingAttributeValue.appendCodePoint(codepoint); + } + } + + final void setEmptyAttributeValue() { + hasEmptyAttributeValue = true; + } + + private void ensureAttributeValue() { + hasPendingAttributeValue = true; + // if on second hit, we'll need to move to the builder + if (pendingAttributeValueS != null) { + pendingAttributeValue.append(pendingAttributeValueS); + pendingAttributeValueS = null; + } + } + } + + public final static class StartTag extends Tag { + StartTag() { + super(TokenType.StartTag); + attributes = new Attributes(); + } + + @Override + public Tag reset() { + super.reset(); + attributes = new Attributes(); + // todo - would prefer these to be null, but need to check Element assertions + return this; + } + + StartTag nameAttr(String name, Attributes attributes) { + this.tagName = name; + this.attributes = attributes; + normalName = lowerCase(tagName); + return this; + } + + @Override + public String toString() { + if (attributes != null && attributes.size() > 0) + return "<" + name() + " " + attributes.toString() + ">"; + else + return "<" + name() + ">"; + } + } + + public final static class EndTag extends Tag{ + EndTag() { + super(TokenType.EndTag); + } + + @Override + public String toString() { + return ""; + } + 
} + + public final static class Comment extends Token { + final StringBuilder data = new StringBuilder(); + boolean bogus = false; + + @Override + public Token reset() { + reset(data); + bogus = false; + return this; + } + + Comment() { + super(TokenType.Comment); + } + + String getData() { + return data.toString(); + } + + @Override + public String toString() { + return ""; + } + } + + public static class Character extends Token { + private String data; + + Character() { + super(TokenType.Character); + } + + @Override + public Token reset() { + data = null; + return this; + } + + Character data(String data) { + this.data = data; + return this; + } + + public String getData() { + return data; + } + + @Override + public String toString() { + return getData(); + } + } + + public final static class CData extends Character { + CData(String data) { + super(); + this.data(data); + } + + @Override + public String toString() { + return ""; + } + + } + + public final static class EOF extends Token { + EOF() { + super(Token.TokenType.EOF); + } + + @Override + public Token reset() { + return this; + } + } + +// final boolean isDoctype() { +// return type == TokenType.Doctype; +// } +// +// final Doctype asDoctype() { +// return (Doctype) this; +// } +// +// final boolean isStartTag() { +// return type == TokenType.StartTag; +// } +// +// final StartTag asStartTag() { +// return (StartTag) this; +// } +// +// final boolean isEndTag() { +// return type == TokenType.EndTag; +// } +// +// final EndTag asEndTag() { +// return (EndTag) this; +// } +// +// final boolean isComment() { +// return type == TokenType.Comment; +// } +// +// final Comment asComment() { +// return (Comment) this; +// } +// +// final boolean isCharacter() { +// return type == TokenType.Character; +// } +// +// final boolean isCData() { +// return this instanceof CData; +// } +// +// final Character asCharacter() { +// return (Character) this; +// } +// +// final boolean isEOF() { +// return type == 
TokenType.EOF; +// } + + public enum TokenType { + Doctype, + StartTag, + EndTag, + Comment, + Character, // note no CData - treated in builder as an extension of Character + EOF + } +} diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Tokeniser.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Tokeniser.java new file mode 100644 index 00000000..3d5284bd --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/Tokeniser.java @@ -0,0 +1,295 @@ +package ru.noties.markwon.html.jsoup.parser; + +import java.util.Arrays; + +import ru.noties.markwon.html.jsoup.helper.Validate; +import ru.noties.markwon.html.jsoup.nodes.Entities; + +/** + * Reads the input stream into tokens. + */ +public final class Tokeniser { + static final char replacementChar = '\uFFFD'; // replaces null character + private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'}; + + // Some illegal character escapes are parsed by browsers as windows-1252 instead.
See issue #1034 + // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state + static final int win1252ExtensionsStart = 0x80; + static final int[] win1252Extensions = new int[] { + // we could build this manually, but Windows-1252 is not a standard java charset so that could break on + // some platforms - this table is verified with a test + 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, + 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, + }; + + static { + Arrays.sort(notCharRefCharsSorted); + } + + private final CharacterReader reader; // html input + private final ParseErrorList errors; // errors found while tokenising + + private TokeniserState state = TokeniserState.Data; // current tokenisation state + private Token emitPending; // the token we are about to emit on next read + private boolean isEmitPending = false; + private String charsString = null; // characters pending an emit. 
Will fall to charsBuilder if more than one + private StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read + StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for + + Token.Tag tagPending; // tag we are building up + Token.StartTag startPending = new Token.StartTag(); + Token.EndTag endPending = new Token.EndTag(); + Token.Character charPending = new Token.Character(); + Token.Doctype doctypePending = new Token.Doctype(); // doctype building up + Token.Comment commentPending = new Token.Comment(); // comment building up + private String lastStartTag; // the last start tag emitted, to test appropriate end tag + + public Tokeniser(CharacterReader reader, ParseErrorList errors) { + this.reader = reader; + this.errors = errors; + } + + public Token read() { + while (!isEmitPending) + state.read(this, reader); + + // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read: + if (charsBuilder.length() > 0) { + String str = charsBuilder.toString(); + charsBuilder.delete(0, charsBuilder.length()); + charsString = null; + return charPending.data(str); + } else if (charsString != null) { + Token token = charPending.data(charsString); + charsString = null; + return token; + } else { + isEmitPending = false; + return emitPending; + } + } + + void emit(Token token) { + Validate.isFalse(isEmitPending, "There is an unread token pending!"); + + emitPending = token; + isEmitPending = true; + + if (token.type == Token.TokenType.StartTag) { + Token.StartTag startTag = (Token.StartTag) token; + lastStartTag = startTag.tagName; + } else if (token.type == Token.TokenType.EndTag) { + Token.EndTag endTag = (Token.EndTag) token; + if (endTag.attributes != null) + error("Attributes incorrectly present on end tag"); + } + } + + void emit(final String str) { + // buffer strings up until last string token found, to emit only one token for 
a run of character refs etc. + // does not set isEmitPending; read checks that + if (charsString == null) { + charsString = str; + } + else { + if (charsBuilder.length() == 0) { // switching to string builder as more than one emit before read + charsBuilder.append(charsString); + } + charsBuilder.append(str); + } + } + + void emit(char[] chars) { + emit(String.valueOf(chars)); + } + + void emit(int[] codepoints) { + emit(new String(codepoints, 0, codepoints.length)); + } + + void emit(char c) { + emit(String.valueOf(c)); + } + + TokeniserState getState() { + return state; + } + + void transition(TokeniserState state) { + this.state = state; + } + + void advanceTransition(TokeniserState state) { + reader.advance(); + this.state = state; + } + + final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays + final private int[] multipointHolder = new int[2]; + int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { + if (reader.isEmpty()) + return null; + if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) + return null; + if (reader.matchesAnySorted(notCharRefCharsSorted)) + return null; + + final int[] codeRef = codepointHolder; + reader.mark(); + if (reader.matchConsume("#")) { // numbered + boolean isHexMode = reader.matchConsumeIgnoreCase("X"); + String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); + if (numRef.length() == 0) { // didn't match anything + characterReferenceError("numeric reference with no numerals"); + reader.rewindToMark(); + return null; + } + if (!reader.matchConsume(";")) + characterReferenceError("missing semicolon"); // missing semi + int charval = -1; + try { + int base = isHexMode ? 
16 : 10; + charval = Integer.valueOf(numRef, base); + } catch (NumberFormatException ignored) { + } // skip + if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { + characterReferenceError("character outside of valid range"); + codeRef[0] = replacementChar; + return codeRef; + } else { + // fix illegal unicode characters to match browser behavior + if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) { + characterReferenceError("character is not a valid unicode code point"); + charval = win1252Extensions[charval - win1252ExtensionsStart]; + } + + // todo: implement number replacement table + // todo: check for extra illegal unicode points as parse errors + codeRef[0] = charval; + return codeRef; + } + } else { // named + // get as many letters as possible, and look for matching entities. + String nameRef = reader.consumeLetterThenDigitSequence(); + boolean looksLegit = reader.matches(';'); + // found if a base named entity without a ;, or an extended entity with the ;. 
+ boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); + + if (!found) { + reader.rewindToMark(); + if (looksLegit) // named with semicolon + characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); + return null; + } + if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { + // don't want that to match + reader.rewindToMark(); + return null; + } + if (!reader.matchConsume(";")) + characterReferenceError("missing semicolon"); // missing semi + int numChars = Entities.codepointsForName(nameRef, multipointHolder); + if (numChars == 1) { + codeRef[0] = multipointHolder[0]; + return codeRef; + } else if (numChars ==2) { + return multipointHolder; + } else { + Validate.fail("Unexpected characters returned for " + nameRef); + return multipointHolder; + } + } + } + + Token.Tag createTagPending(boolean start) { + tagPending = start ? startPending.reset() : endPending.reset(); + return tagPending; + } + + void emitTagPending() { + tagPending.finaliseTag(); + emit(tagPending); + } + + void createCommentPending() { + commentPending.reset(); + } + + void emitCommentPending() { + emit(commentPending); + } + + void createDoctypePending() { + doctypePending.reset(); + } + + void emitDoctypePending() { + emit(doctypePending); + } + + void createTempBuffer() { + Token.reset(dataBuffer); + } + + boolean isAppropriateEndTagToken() { + return lastStartTag != null && tagPending.name().equalsIgnoreCase(lastStartTag); + } + + String appropriateEndTagName() { + return lastStartTag; // could be null + } + + void error(TokeniserState state) { + if (errors.canAddError()) + errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state)); + } + + void eofError(TokeniserState state) { + if (errors.canAddError()) + errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", 
state)); + } + + private void characterReferenceError(String message) { + if (errors.canAddError()) + errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message)); + } + + void error(String errorMsg) { + if (errors.canAddError()) + errors.add(new ParseError(reader.pos(), errorMsg)); + } + + boolean currentNodeInHtmlNS() { + // todo: implement namespaces correctly + return true; + // Element currentNode = currentNode(); + // return currentNode != null && currentNode.namespace().equals("HTML"); + } + +// /** +// * Utility method to consume reader and unescape entities found within. +// * @param inAttribute if the text to be unescaped is in an attribute +// * @return unescaped string from reader +// */ +// String unescapeEntities(boolean inAttribute) { +// StringBuilder builder = StringUtil.stringBuilder(); +// while (!reader.isEmpty()) { +// builder.append(reader.consumeTo('&')); +// if (reader.matches('&')) { +// reader.consume(); +// int[] c = consumeCharacterReference(null, inAttribute); +// if (c == null || c.length==0) +// builder.append('&'); +// else { +// builder.appendCodePoint(c[0]); +// if (c.length == 2) +// builder.appendCodePoint(c[1]); +// } +// +// } +// } +// return builder.toString(); +// } +} diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/TokeniserState.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/TokeniserState.java new file mode 100644 index 00000000..01a98958 --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/jsoup/parser/TokeniserState.java @@ -0,0 +1,1737 @@ +package ru.noties.markwon.html.jsoup.parser; + +import ru.noties.markwon.html.jsoup.nodes.DocumentType; + +/** + * States and transition activations for the Tokeniser. 
+ */ +enum TokeniserState { + Data { + // in data state, gather characters until a character reference or tag is found + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case '&': + t.advanceTransition(CharacterReferenceInData); + break; + case '<': + t.advanceTransition(TagOpen); + break; + case nullChar: + t.error(this); // NOT replacement character (oddly?) + t.emit(r.consume()); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeData(); + t.emit(data); + break; + } + } + }, + CharacterReferenceInData { + // from & in data + void read(Tokeniser t, CharacterReader r) { + readCharRef(t, Data); + } + }, + Rcdata { + /// handles data in title, textarea etc + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case '&': + t.advanceTransition(CharacterReferenceInRcdata); + break; + case '<': + t.advanceTransition(RcdataLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeToAny('&', '<', nullChar); + t.emit(data); + break; + } + } + }, + CharacterReferenceInRcdata { + void read(Tokeniser t, CharacterReader r) { + readCharRef(t, Rcdata); + } + }, + Rawtext { + void read(Tokeniser t, CharacterReader r) { + readData(t, r, this, RawtextLessthanSign); + } + }, + ScriptData { + void read(Tokeniser t, CharacterReader r) { + readData(t, r, this, ScriptDataLessthanSign); + } + }, + PLAINTEXT { + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeTo(nullChar); + t.emit(data); + break; + } + } + }, + TagOpen { + // from < in data + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case '!': + t.advanceTransition(MarkupDeclarationOpen); + break; + case '/': 
+ t.advanceTransition(EndTagOpen); + break; + case '?': + t.advanceTransition(BogusComment); + break; + default: + if (r.matchesLetter()) { + t.createTagPending(true); + t.transition(TagName); + } else { + t.error(this); + t.emit('<'); // char that got us here + t.transition(Data); + } + break; + } + } + }, + EndTagOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.isEmpty()) { + t.eofError(this); + t.emit("')) { + t.error(this); + t.advanceTransition(Data); + } else { + t.error(this); + t.advanceTransition(BogusComment); + } + } + }, + TagName { + // from < or ': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: // replacement + t.tagPending.appendTagName(replacementStr); + break; + case eof: // should emit pending tag? + t.eofError(this); + t.transition(Data); + break; + default: // buffer underrun + t.tagPending.appendTagName(c); + } + } + }, + RcdataLessthanSign { + // from < in rcdata + void read(Tokeniser t, CharacterReader r) { + if (r.matches('/')) { + t.createTempBuffer(); + t.advanceTransition(RCDATAEndTagOpen); + } else if (r.matchesLetter() && t.appropriateEndTagName() != null && !r.containsIgnoreCase("), so rather than + // consuming to EOF; break out here + t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName()); + t.emitTagPending(); + r.unconsume(); // undo "<" + t.transition(Data); + } else { + t.emit("<"); + t.transition(Rcdata); + } + } + }, + RCDATAEndTagOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createTagPending(false); + t.tagPending.appendTagName(r.current()); + t.dataBuffer.append(r.current()); + t.advanceTransition(RCDATAEndTagName); + } else { + t.emit("': + if (t.isAppropriateEndTagToken()) { + t.emitTagPending(); + t.transition(Data); + } + else + anythingElse(t, r); + break; + default: + anythingElse(t, r); + } + } + + private void anythingElse(Tokeniser t, CharacterReader r) { + t.emit("': + t.emit(c); + t.transition(ScriptData); + break; + case 
nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataEscaped); + break; + default: + t.emit(c); + t.transition(ScriptDataEscaped); + } + } + }, + ScriptDataEscapedLessthanSign { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createTempBuffer(); + t.dataBuffer.append(r.current()); + t.emit("<" + r.current()); + t.advanceTransition(ScriptDataDoubleEscapeStart); + } else if (r.matches('/')) { + t.createTempBuffer(); + t.advanceTransition(ScriptDataEscapedEndTagOpen); + } else { + t.emit('<'); + t.transition(ScriptDataEscaped); + } + } + }, + ScriptDataEscapedEndTagOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createTagPending(false); + t.tagPending.appendTagName(r.current()); + t.dataBuffer.append(r.current()); + t.advanceTransition(ScriptDataEscapedEndTagName); + } else { + t.emit("': + t.emit(c); + t.transition(ScriptData); + break; + case nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataDoubleEscaped); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.emit(c); + t.transition(ScriptDataDoubleEscaped); + } + } + }, + ScriptDataDoubleEscapedLessthanSign { + void read(Tokeniser t, CharacterReader r) { + if (r.matches('/')) { + t.emit('/'); + t.createTempBuffer(); + t.advanceTransition(ScriptDataDoubleEscapeEnd); + } else { + t.transition(ScriptDataDoubleEscaped); + } + } + }, + ScriptDataDoubleEscapeEnd { + void read(Tokeniser t, CharacterReader r) { + handleDataDoubleEscapeTag(t,r, ScriptDataEscaped, ScriptDataDoubleEscaped); + } + }, + BeforeAttributeName { + // from tagname ': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.newAttribute(); + r.unconsume(); + t.transition(AttributeName); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + case '=': + t.error(this); + t.tagPending.newAttribute(); + 
t.tagPending.appendAttributeName(c); + t.transition(AttributeName); + break; + default: // A-Z, anything else + t.tagPending.newAttribute(); + r.unconsume(); + t.transition(AttributeName); + } + } + }, + AttributeName { + // from before attribute name + void read(Tokeniser t, CharacterReader r) { + String name = r.consumeToAnySorted(attributeNameCharsSorted); + t.tagPending.appendAttributeName(name); + + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + t.transition(AfterAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '=': + t.transition(BeforeAttributeValue); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeName(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + t.error(this); + t.tagPending.appendAttributeName(c); + break; + default: // buffer underrun + t.tagPending.appendAttributeName(c); + } + } + }, + AfterAttributeName { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + // ignore + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '=': + t.transition(BeforeAttributeValue); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeName(replacementChar); + t.transition(AttributeName); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + t.error(this); + t.tagPending.newAttribute(); + t.tagPending.appendAttributeName(c); + t.transition(AttributeName); + break; + default: // A-Z, anything else + t.tagPending.newAttribute(); + r.unconsume(); + t.transition(AttributeName); + } + } + }, + BeforeAttributeValue { + void read(Tokeniser t, CharacterReader 
r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + // ignore + break; + case '"': + t.transition(AttributeValue_doubleQuoted); + break; + case '&': + r.unconsume(); + t.transition(AttributeValue_unquoted); + break; + case '\'': + t.transition(AttributeValue_singleQuoted); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + t.transition(AttributeValue_unquoted); + break; + case eof: + t.eofError(this); + t.emitTagPending(); + t.transition(Data); + break; + case '>': + t.error(this); + t.emitTagPending(); + t.transition(Data); + break; + case '<': + case '=': + case '`': + t.error(this); + t.tagPending.appendAttributeValue(c); + t.transition(AttributeValue_unquoted); + break; + default: + r.unconsume(); + t.transition(AttributeValue_unquoted); + } + } + }, + AttributeValue_doubleQuoted { + void read(Tokeniser t, CharacterReader r) { + String value = r.consumeToAny(attributeDoubleValueCharsSorted); + if (value.length() > 0) + t.tagPending.appendAttributeValue(value); + else + t.tagPending.setEmptyAttributeValue(); + + char c = r.consume(); + switch (c) { + case '"': + t.transition(AfterAttributeValue_quoted); + break; + case '&': + int[] ref = t.consumeCharacterReference('"', true); + if (ref != null) + t.tagPending.appendAttributeValue(ref); + else + t.tagPending.appendAttributeValue('&'); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: // hit end of buffer in first read, still in attribute + t.tagPending.appendAttributeValue(c); + } + } + }, + AttributeValue_singleQuoted { + void read(Tokeniser t, CharacterReader r) { + String value = r.consumeToAny(attributeSingleValueCharsSorted); + if (value.length() > 0) + t.tagPending.appendAttributeValue(value); + else + t.tagPending.setEmptyAttributeValue(); + + char c = r.consume(); + switch 
(c) { + case '\'': + t.transition(AfterAttributeValue_quoted); + break; + case '&': + int[] ref = t.consumeCharacterReference('\'', true); + if (ref != null) + t.tagPending.appendAttributeValue(ref); + else + t.tagPending.appendAttributeValue('&'); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: // hit end of buffer in first read, still in attribute + t.tagPending.appendAttributeValue(c); + } + } + }, + AttributeValue_unquoted { + void read(Tokeniser t, CharacterReader r) { + String value = r.consumeToAnySorted(attributeValueUnquoted); + if (value.length() > 0) + t.tagPending.appendAttributeValue(value); + + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '&': + int[] ref = t.consumeCharacterReference('>', true); + if (ref != null) + t.tagPending.appendAttributeValue(ref); + else + t.tagPending.appendAttributeValue('&'); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + case '=': + case '`': + t.error(this); + t.tagPending.appendAttributeValue(c); + break; + default: // hit end of buffer in first read, still in attribute + t.tagPending.appendAttributeValue(c); + } + + } + }, + // CharacterReferenceInAttributeValue state handled inline + AfterAttributeValue_quoted { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + 
t.transition(Data); + break; + default: + t.error(this); + r.unconsume(); + t.transition(BeforeAttributeName); + } + + } + }, + SelfClosingStartTag { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '>': + t.tagPending.selfClosing = true; + t.emitTagPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.error(this); + r.unconsume(); + t.transition(BeforeAttributeName); + } + } + }, + BogusComment { + void read(Tokeniser t, CharacterReader r) { + // todo: handle bogus comment starting from eof. when does that trigger? + // rewind to capture character that lead us here + r.unconsume(); + Token.Comment comment = new Token.Comment(); + comment.bogus = true; + comment.data.append(r.consumeTo('>')); + // todo: replace nullChar with replaceChar + t.emit(comment); + t.advanceTransition(Data); + } + }, + MarkupDeclarationOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.matchConsume("--")) { + t.createCommentPending(); + t.transition(CommentStart); + } else if (r.matchConsumeIgnoreCase("DOCTYPE")) { + t.transition(Doctype); + } else if (r.matchConsume("[CDATA[")) { + // todo: should actually check current namepspace, and only non-html allows cdata. 
until namespace + // is implemented properly, keep handling as cdata + //} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) { + t.createTempBuffer(); + t.transition(CdataSection); + } else { + t.error(this); + t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind + } + } + }, + CommentStart { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '-': + t.transition(CommentStartDash); + break; + case nullChar: + t.error(this); + t.commentPending.data.append(replacementChar); + t.transition(Comment); + break; + case '>': + t.error(this); + t.emitCommentPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append(c); + t.transition(Comment); + } + } + }, + CommentStartDash { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '-': + t.transition(CommentStartDash); + break; + case nullChar: + t.error(this); + t.commentPending.data.append(replacementChar); + t.transition(Comment); + break; + case '>': + t.error(this); + t.emitCommentPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append(c); + t.transition(Comment); + } + } + }, + Comment { + void read(Tokeniser t, CharacterReader r) { + char c = r.current(); + switch (c) { + case '-': + t.advanceTransition(CommentEndDash); + break; + case nullChar: + t.error(this); + r.advance(); + t.commentPending.data.append(replacementChar); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append(r.consumeToAny('-', nullChar)); + } + } + }, + CommentEndDash { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '-': + t.transition(CommentEnd); + 
break; + case nullChar: + t.error(this); + t.commentPending.data.append('-').append(replacementChar); + t.transition(Comment); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append('-').append(c); + t.transition(Comment); + } + } + }, + CommentEnd { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '>': + t.emitCommentPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.commentPending.data.append("--").append(replacementChar); + t.transition(Comment); + break; + case '!': + t.error(this); + t.transition(CommentEndBang); + break; + case '-': + t.error(this); + t.commentPending.data.append('-'); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.error(this); + t.commentPending.data.append("--").append(c); + t.transition(Comment); + } + } + }, + CommentEndBang { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '-': + t.commentPending.data.append("--!"); + t.transition(CommentEndDash); + break; + case '>': + t.emitCommentPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.commentPending.data.append("--!").append(replacementChar); + t.transition(Comment); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append("--!").append(c); + t.transition(Comment); + } + } + }, + Doctype { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + t.transition(BeforeDoctypeName); + break; + case eof: + t.eofError(this); + // note: fall through to > case + case '>': // catch invalid + t.error(this); + t.createDoctypePending(); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + 
default: + t.error(this); + t.transition(BeforeDoctypeName); + } + } + }, + BeforeDoctypeName { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createDoctypePending(); + t.transition(DoctypeName); + return; + } + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + break; // ignore whitespace + case nullChar: + t.error(this); + t.createDoctypePending(); + t.doctypePending.name.append(replacementChar); + t.transition(DoctypeName); + break; + case eof: + t.eofError(this); + t.createDoctypePending(); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.createDoctypePending(); + t.doctypePending.name.append(c); + t.transition(DoctypeName); + } + } + }, + DoctypeName { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + String name = r.consumeLetterSequence(); + t.doctypePending.name.append(name); + return; + } + char c = r.consume(); + switch (c) { + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + t.transition(AfterDoctypeName); + break; + case nullChar: + t.error(this); + t.doctypePending.name.append(replacementChar); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.name.append(c); + } + } + }, + AfterDoctypeName { + void read(Tokeniser t, CharacterReader r) { + if (r.isEmpty()) { + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + return; + } + if (r.matchesAny('\t', '\n', '\r', '\f', ' ')) + r.advance(); // ignore whitespace + else if (r.matches('>')) { + t.emitDoctypePending(); + t.advanceTransition(Data); + } else if (r.matchConsumeIgnoreCase(DocumentType.PUBLIC_KEY)) { + t.doctypePending.pubSysKey = DocumentType.PUBLIC_KEY; + 
t.transition(AfterDoctypePublicKeyword); + } else if (r.matchConsumeIgnoreCase(DocumentType.SYSTEM_KEY)) { + t.doctypePending.pubSysKey = DocumentType.SYSTEM_KEY; + t.transition(AfterDoctypeSystemKeyword); + } else { + t.error(this); + t.doctypePending.forceQuirks = true; + t.advanceTransition(BogusDoctype); + } + + } + }, + AfterDoctypePublicKeyword { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + t.transition(BeforeDoctypePublicIdentifier); + break; + case '"': + t.error(this); + // set public id to empty string + t.transition(DoctypePublicIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // set public id to empty string + t.transition(DoctypePublicIdentifier_singleQuoted); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + BeforeDoctypePublicIdentifier { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + break; + case '"': + // set public id to empty string + t.transition(DoctypePublicIdentifier_doubleQuoted); + break; + case '\'': + // set public id to empty string + t.transition(DoctypePublicIdentifier_singleQuoted); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + DoctypePublicIdentifier_doubleQuoted { + 
void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '"': + t.transition(AfterDoctypePublicIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.publicIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.publicIdentifier.append(c); + } + } + }, + DoctypePublicIdentifier_singleQuoted { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\'': + t.transition(AfterDoctypePublicIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.publicIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.publicIdentifier.append(c); + } + } + }, + AfterDoctypePublicIdentifier { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + t.transition(BetweenDoctypePublicAndSystemIdentifiers); + break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case '"': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + 
BetweenDoctypePublicAndSystemIdentifiers { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case '"': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + AfterDoctypeSystemKeyword { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + t.transition(BeforeDoctypeSystemIdentifier); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case '"': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + } + } + }, + BeforeDoctypeSystemIdentifier { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + break; + case '"': + // set system id to empty string + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + // set public id to empty string + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case '>': + t.error(this); + 
t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + DoctypeSystemIdentifier_doubleQuoted { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '"': + t.transition(AfterDoctypeSystemIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.systemIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.systemIdentifier.append(c); + } + } + }, + DoctypeSystemIdentifier_singleQuoted { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\'': + t.transition(AfterDoctypeSystemIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.systemIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.systemIdentifier.append(c); + } + } + }, + AfterDoctypeSystemIdentifier { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + 
t.transition(BogusDoctype); + // NOT force quirks + } + } + }, + BogusDoctype { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.emitDoctypePending(); + t.transition(Data); + break; + default: + // ignore char + break; + } + } + }, + CdataSection { + void read(Tokeniser t, CharacterReader r) { + String data = r.consumeTo("]]>"); + t.dataBuffer.append(data); + if (r.matchConsume("]]>") || r.isEmpty()) { + t.emit(new Token.CData(t.dataBuffer.toString())); + t.transition(Data); + }// otherwise, buffer underrun, stay in data section + } + }; + + + abstract void read(Tokeniser t, CharacterReader r); + + static final char nullChar = '\u0000'; + // char searches. must be sorted, used in inSorted. MUST update TokenisetStateTest if more arrays are added. + static final char[] attributeSingleValueCharsSorted = new char[]{nullChar, '&', '\''}; + static final char[] attributeDoubleValueCharsSorted = new char[]{nullChar, '"', '&'}; + static final char[] attributeNameCharsSorted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>'}; + static final char[] attributeValueUnquoted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '&', '\'', '<', '=', '>', '`'}; + + private static final char replacementChar = Tokeniser.replacementChar; + private static final String replacementStr = String.valueOf(Tokeniser.replacementChar); + private static final char eof = CharacterReader.EOF; + + /** + * Handles RawtextEndTagName, ScriptDataEndTagName, and ScriptDataEscapedEndTagName. Same body impl, just + * different else exit transitions. 
+ */ + private static void handleDataEndTag(Tokeniser t, CharacterReader r, TokeniserState elseTransition) { + if (r.matchesLetter()) { + String name = r.consumeLetterSequence(); + t.tagPending.appendTagName(name); + t.dataBuffer.append(name); + return; + } + + boolean needsExitTransition = false; + if (t.isAppropriateEndTagToken() && !r.isEmpty()) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\r': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + default: + t.dataBuffer.append(c); + needsExitTransition = true; + } + } else { + needsExitTransition = true; + } + + if (needsExitTransition) { + t.emit("': + if (t.dataBuffer.toString().equals("script")) + t.transition(primary); + else + t.transition(fallback); + t.emit(c); + break; + default: + r.unconsume(); + t.transition(fallback); + } + } +} diff --git a/settings.gradle b/settings.gradle index 29dc38f9..11a2f10c 100644 --- a/settings.gradle +++ b/settings.gradle @@ -1 +1,2 @@ -include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension', ':library-syntax' +include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension', + ':library-syntax', ':html-parser-api', ':html-parser-impl'