Added 2 modules: html-parser-api and html-parser-impl

This commit is contained in:
Dimitry Ivanov 2018-08-17 12:53:36 +03:00
parent 7c7b1f59a8
commit ff3bedc37e
24 changed files with 4939 additions and 1 deletions

View File

@ -0,0 +1,31 @@
// Android library module. TARGET_SDK, BUILD_TOOLS, MIN_SDK, SUPPORT_ANNOTATIONS and
// LOCAL_MAVEN_URL are not defined in this script and must be provided from outside it.
apply plugin: 'com.android.library'
android {
compileSdkVersion TARGET_SDK
buildToolsVersion BUILD_TOOLS
defaultConfig {
minSdkVersion MIN_SDK
targetSdkVersion TARGET_SDK
versionCode 1
versionName version
}
}
dependencies {
// `api` scope: the annotations dependency is exposed to consumers of this module
api SUPPORT_ANNOTATIONS
}
// Disable the generateReleaseBuildConfig task once the project has been evaluated
afterEvaluate {
generateReleaseBuildConfig.enabled = false
}
// Publishing is opt-in: the push script is applied only when the `release` property is set.
// When `local` is set as well, both repository URLs are pointed at LOCAL_MAVEN_URL first.
// NOTE(review): the script is fetched from the `master` branch of a remote repository,
// so its content is not pinned to a specific revision.
// todo: remove `local` check after merge with latest version (1.1.1)
if (hasProperty('release')) {
if (hasProperty('local')) {
ext.RELEASE_REPOSITORY_URL = LOCAL_MAVEN_URL
ext.SNAPSHOT_REPOSITORY_URL = LOCAL_MAVEN_URL
}
apply from: 'https://raw.githubusercontent.com/noties/gradle-mvn-push/master/gradle-mvn-push-aar.gradle'
}

View File

@ -0,0 +1 @@
<!-- Library manifest: declares only the package name of this module. -->
<manifest package="ru.noties.markwon.html" />

View File

@ -0,0 +1,54 @@
package ru.noties.markwon.html;
import android.support.annotation.NonNull;
import android.support.annotation.Nullable;
import java.util.List;
/**
* Describes an HTML tag by its name and by the start and end indices reported for it.
*
* @see Inline
* @see Block
*/
public interface HtmlTag {
/**
* @return normalized tag name (lower-case)
*/
@NonNull
String name();
/**
* @return index at which this tag starts
*/
int start();
/**
* @return index at which this tag ends
*/
int end();
/**
* Represents <em>really</em> inline HTML tags (unlike commonmark definitions)
*/
interface Inline extends HtmlTag {
}
/**
* Represents HTML block tags. Please note that all tags that are not inline should be
* considered as block tags
*/
interface Block extends HtmlTag {
/**
* @return parent {@link Block} or null if there is no parent (this block is at root level)
*/
@Nullable
Block parent();
/**
* @return list of child blocks of this block
*/
@NonNull
List<Block> children();
}
}

View File

@ -0,0 +1,36 @@
package ru.noties.markwon.html;
import android.support.annotation.NonNull;
import java.util.List;
/**
* Contract for a parser that is fed HTML fragments and later hands the collected
* {@link HtmlTag.Inline} and {@link HtmlTag.Block} tags to {@link FlushAction} callbacks.
*/
public abstract class MarkwonHtmlParser {
/**
* @return a new {@code MarkwonHtmlParserNoOp} instance
*/
@NonNull
public static MarkwonHtmlParser noOp() {
return new MarkwonHtmlParserNoOp();
}
/**
* Callback that receives a list of tags from one of the {@code flush*} methods.
*/
public interface FlushAction<T> {
void apply(@NonNull List<T> tags);
}
/**
* Processes a single HTML fragment.
*
* @param output object that can be both read ({@link CharSequence}) and appended to
* ({@link Appendable})
* @param htmlFragment HTML fragment to process
*/
public abstract <T extends Appendable & CharSequence> void processFragment(
@NonNull T output,
@NonNull String htmlFragment);
// clear all pending tags (if any)
// todo: we also can do this: if supplied value is -1 (for example) we ignore tags that are not closed
/**
* @param documentLength NOTE(review): MarkwonHtmlParserImpl uses this value as the end index
* of inline tags that are still open; confirm this is the contract for all implementations
* @param action callback that receives the pending inline tags
*/
public abstract void flushInlineTags(
int documentLength,
@NonNull FlushAction<HtmlTag.Inline> action);
// clear all pending blocks if any
// todo: we also can do this: if supplied value is -1 (for example) we ignore tags that are not closed
/**
* @param documentLength NOTE(review): MarkwonHtmlParserImpl uses this value as the end index
* of block tags that are still open; confirm this is the contract for all implementations
* @param action callback that receives the pending block tags
*/
public abstract void flushBlockTags(
int documentLength,
@NonNull FlushAction<HtmlTag.Block> action);
/**
* Resets this parser. MarkwonHtmlParserImpl drops all pending inline and block tags here.
*/
public abstract void reset();
}

View File

@ -0,0 +1,26 @@
package ru.noties.markwon.html;
import android.support.annotation.NonNull;
/**
* {@link MarkwonHtmlParser} whose methods all have empty bodies: fragments are ignored and
* the supplied flush actions are never invoked.
*/
class MarkwonHtmlParserNoOp extends MarkwonHtmlParser {
// intentionally empty: nothing is appended to `output`
@Override
public <T extends Appendable & CharSequence> void processFragment(@NonNull T output, @NonNull String htmlFragment) {
}
// intentionally empty: `action` is not called
@Override
public void flushInlineTags(int documentLength, @NonNull FlushAction<HtmlTag.Inline> action) {
}
// intentionally empty: `action` is not called
@Override
public void flushBlockTags(int documentLength, @NonNull FlushAction<HtmlTag.Block> action) {
}
// intentionally empty: there is no state to reset
@Override
public void reset() {
}
}

View File

@ -0,0 +1,32 @@
// Android library module. TARGET_SDK, BUILD_TOOLS, MIN_SDK, SUPPORT_ANNOTATIONS and
// LOCAL_MAVEN_URL are not defined in this script and must be provided from outside it.
apply plugin: 'com.android.library'
android {
compileSdkVersion TARGET_SDK
buildToolsVersion BUILD_TOOLS
defaultConfig {
minSdkVersion MIN_SDK
targetSdkVersion TARGET_SDK
versionCode 1
versionName version
}
}
dependencies {
// both dependencies use `api` scope, so they are exposed to consumers of this module
api SUPPORT_ANNOTATIONS
api project(':html-parser-api')
}
// Disable the generateReleaseBuildConfig task once the project has been evaluated
afterEvaluate {
generateReleaseBuildConfig.enabled = false
}
// Publishing is opt-in: the push script is applied only when the `release` property is set.
// When `local` is set as well, both repository URLs are pointed at LOCAL_MAVEN_URL first.
// NOTE(review): the script is fetched from the `master` branch of a remote repository,
// so its content is not pinned to a specific revision.
// todo: remove `local` check after merge with latest version (1.1.1)
if (hasProperty('release')) {
if (hasProperty('local')) {
ext.RELEASE_REPOSITORY_URL = LOCAL_MAVEN_URL
ext.SNAPSHOT_REPOSITORY_URL = LOCAL_MAVEN_URL
}
apply from: 'https://raw.githubusercontent.com/noties/gradle-mvn-push/master/gradle-mvn-push-aar.gradle'
}

View File

@ -0,0 +1 @@
<!-- Library manifest: declares only the package name of this module. -->
<manifest package="ru.noties.markwon.html" />

View File

@ -0,0 +1,117 @@
package ru.noties.markwon.html;
import android.support.annotation.NonNull;
import android.support.annotation.Nullable;
import java.util.Collections;
import java.util.List;
/**
 * Base implementation of {@link HtmlTag}: stores the tag name, its start index and an end
 * index that stays at {@link #NO_VALUE} until the tag is closed.
 */
abstract class HtmlTagImpl implements HtmlTag {

    /** Value held by {@link #end} while the tag is still open. */
    static final int NO_VALUE = -1;

    final String name;
    final int start;
    int end = NO_VALUE;

    protected HtmlTagImpl(@NonNull String name, int start) {
        this.name = name;
        this.start = start;
    }

    @NonNull
    @Override
    public String name() {
        return name;
    }

    @Override
    public int start() {
        return start;
    }

    @Override
    public int end() {
        return end;
    }

    /**
     * @return {@code true} once {@link #end} is greater than {@link #NO_VALUE}
     */
    boolean isClosed() {
        return end > NO_VALUE;
    }

    /**
     * Closes this tag at the supplied index. Both implementations in this file ignore the
     * call when the tag is already closed.
     *
     * @param end index to store as the end of this tag
     */
    abstract void closeAt(int end);

    /**
     * Inline tag: closing only records the end index.
     */
    static class InlineImpl extends HtmlTagImpl implements Inline {

        InlineImpl(@NonNull String name, int start) {
            super(name, start);
        }

        @Override
        void closeAt(int end) {
            if (!isClosed()) {
                super.end = end;
            }
        }
    }

    /**
     * Block tag that knows its parent and its child blocks.
     */
    static class BlockImpl extends HtmlTagImpl implements Block {

        /**
         * @return a new root block: empty name, start at 0 and no parent
         */
        @NonNull
        static BlockImpl root() {
            //noinspection ConstantConditions
            return new BlockImpl("", 0, null);
        }

        @NonNull
        static BlockImpl create(@NonNull String name, int start, @NonNull BlockImpl parent) {
            return new BlockImpl(name, start, parent);
        }

        // null only for the root block (see #root())
        final BlockImpl parent;

        // null until the first child is assigned from outside; replaced by an
        // unmodifiable (or empty) list in #closeAt(int)
        List<BlockImpl> children;

        @SuppressWarnings("NullableProblems")
        BlockImpl(@NonNull String name, int start, @NonNull BlockImpl parent) {
            super(name, start);
            this.parent = parent;
        }

        /**
         * Closes this block and every child that is still open at the same index, then
         * freezes the child list. Does nothing if this block is already closed.
         */
        @Override
        void closeAt(int end) {
            if (!isClosed()) {
                super.end = end;
                if (children != null) {
                    for (BlockImpl child : children) {
                        child.closeAt(end);
                    }
                    children = Collections.unmodifiableList(children);
                } else {
                    children = Collections.emptyList();
                }
            }
        }

        boolean isRoot() {
            return parent == null;
        }

        /**
         * @return parent of this block
         * @throws IllegalStateException if called on the root block
         */
        @Nullable
        @Override
        public Block parent() {
            if (parent == null) {
                throw new IllegalStateException("#parent() getter was called on the root node " +
                        "which should not be exposed outside internal usage");
            }
            return parent;
        }

        /**
         * @return child blocks; an empty list when no child has been recorded
         */
        @NonNull
        @Override
        public List<Block> children() {
            // `children` stays null when the `end` field was assigned directly instead of
            // going through #closeAt(int), or when the block is still open and childless;
            // honour the @NonNull contract in those cases
            final List<BlockImpl> current = children;
            if (current == null) {
                return Collections.emptyList();
            }
            //noinspection unchecked
            return (List<Block>) (List<? extends Block>) current;
        }
    }
}

View File

@ -0,0 +1,396 @@
package ru.noties.markwon.html;
import android.support.annotation.NonNull;
import android.support.annotation.Nullable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import ru.noties.markwon.html.HtmlTag.Block;
import ru.noties.markwon.html.HtmlTag.Inline;
import ru.noties.markwon.html.HtmlTagImpl.BlockImpl;
import ru.noties.markwon.html.HtmlTagImpl.InlineImpl;
import ru.noties.markwon.html.jsoup.parser.CharacterReader;
import ru.noties.markwon.html.jsoup.parser.ParseErrorList;
import ru.noties.markwon.html.jsoup.parser.Token;
import ru.noties.markwon.html.jsoup.parser.Tokeniser;
public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
/**
 * Factory method.
 *
 * @return a new {@link MarkwonHtmlParserImpl}
 */
@NonNull
public static MarkwonHtmlParserImpl create() {
    final MarkwonHtmlParserImpl parser = new MarkwonHtmlParserImpl();
    return parser;
}
// tag names that processFragment routes to the inline handlers
// https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
private static final Set<String> INLINE_TAGS;
// inline tags from this set are closed right at their start tag (see processInlineTagStart)
private static final Set<String> VOID_TAGS;
// these are the tags that are considered _block_ ones
// this parser will ensure that these blocks are started on a new line
// other tags that are NOT inline are considered as block tags, but won't have new line
// inserted before them
// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
private static final Set<String> BLOCK_TAGS;
private static final String TAG_PARAGRAPH = "p";
private static final String TAG_LIST_ITEM = "li";
// appended in place of an `img` tag that has no (or an empty) `alt` attribute; the value is U+FFFC
// todo: make it configurable
private static final String IMG_REPLACEMENT = "\uFFFC";
// all three sets are unmodifiable and hold lower-case names only
static {
INLINE_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
"a", "abbr", "acronym",
"b", "bdo", "big", "br", "button",
"cite", "code",
"dfn",
"em",
"i", "img", "input",
"kbd",
"label",
"map",
"object",
"q",
"samp", "script", "select", "small", "span", "strong", "sub", "sup",
"textarea", "time", "tt",
"var"
)));
VOID_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
"area",
"base", "br",
"col",
"embed",
"hr",
"img", "input",
"keygen",
"link",
"meta",
"param",
"source",
"track",
"wbr"
)));
BLOCK_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
"address", "article", "aside",
"blockquote",
"canvas",
"dd", "div", "dl", "dt",
"fieldset", "figcaption", "figure", "footer", "form",
"h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
"li",
"main",
"nav", "noscript",
"ol", "output",
"p", "pre",
"section",
"table", "tfoot",
"ul",
"video"
)));
}
// inline tags recorded since the last flushInlineTags/reset call
private final List<InlineImpl> inlineTags = new ArrayList<>(0);
// innermost block that is currently open; starts out as a root block
private BlockImpl currentBlock = BlockImpl.root();
/**
 * Tokenises {@code htmlFragment} and hands every start tag, end tag and character token to
 * the matching {@code process*} method until the EOF token is read. Tokens of any other
 * type are skipped.
 *
 * @param output       target passed on to the {@code process*} methods
 * @param htmlFragment HTML fragment to tokenise
 */
@Override
public <T extends Appendable & CharSequence> void processFragment(
        @NonNull T output,
        @NonNull String htmlFragment) {

    // todo: maybe there is a way to reuse tokeniser...
    final CharacterReader reader = new CharacterReader(htmlFragment);
    final Tokeniser tokeniser = new Tokeniser(reader, ParseErrorList.noTracking());

    Token token;
    while (Token.TokenType.EOF != (token = tokeniser.read()).type) {

        switch (token.type) {

            case StartTag: {
                final Token.StartTag start = (Token.StartTag) token;
                if (isInlineTag(start.normalName)) {
                    processInlineTagStart(output, start);
                } else {
                    processBlockTagStart(output, start);
                }
                break;
            }

            case EndTag: {
                final Token.EndTag end = (Token.EndTag) token;
                if (isInlineTag(end.normalName)) {
                    processInlineTagEnd(output, end);
                } else {
                    processBlockTagEnd(output, end);
                }
                break;
            }

            case Character:
                processCharacter(output, (Token.Character) token);
                break;
        }

        // do not forget to reset processed token (even if it's not processed)
        token.reset();
    }
}
/**
 * Closes every pending inline tag that is still open at {@code documentLength}, passes all
 * pending inline tags to {@code action} and then forgets them. {@code action} is not
 * invoked when there are no pending tags.
 *
 * @param documentLength end index applied to inline tags that are still open
 * @param action         receives an unmodifiable snapshot of the pending inline tags
 */
@Override
public void flushInlineTags(int documentLength, @NonNull FlushAction<Inline> action) {

    if (inlineTags.isEmpty()) {
        return;
    }

    for (InlineImpl inline : inlineTags) {
        inline.closeAt(documentLength);
    }

    // Copy before handing out: an unmodifiable wrapper around `inlineTags` is only a view,
    // so a list kept by the action would become empty as soon as `inlineTags` is cleared
    final List<Inline> flushed =
            Collections.unmodifiableList(new ArrayList<Inline>(inlineTags));

    action.apply(flushed);
    inlineTags.clear();
}
/**
 * Closes the whole pending block tree at {@code documentLength} and passes the children of
 * the root block to {@code action}. {@code action} is not invoked when the root has no
 * children. A new root block is installed afterwards.
 *
 * @param documentLength end index applied to blocks that are still open
 * @param action         receives the children of the root block
 */
@Override
public void flushBlockTags(int documentLength, @NonNull FlushAction<Block> action) {

    // climb from the innermost open block up to the root
    BlockImpl node = currentBlock;
    for (; !node.isRoot(); node = node.parent) {
        // nothing to do while climbing
    }

    node.closeAt(documentLength);

    final List<Block> topLevel = node.children();
    if (!topLevel.isEmpty()) {
        action.apply(topLevel);
    }

    currentBlock = BlockImpl.root();
}
/**
 * Drops all pending state: installs a new root block and removes all pending inline tags.
 */
@Override
public void reset() {
    currentBlock = BlockImpl.root();
    inlineTags.clear();
}
/**
 * Records an inline tag that starts at the current length of {@code output}. Void and
 * self-closing tags are closed immediately after {@link #processVoidTag} had a chance to
 * append their content; if nothing was appended for such a tag it is not recorded at all.
 *
 * @param output   target the tag indices refer to
 * @param startTag start tag token
 */
protected <T extends Appendable & CharSequence> void processInlineTagStart(
        @NonNull T output,
        @NonNull Token.StartTag startTag) {

    final String tagName = startTag.normalName;
    final int startIndex = output.length();
    final InlineImpl tag = new InlineImpl(tagName, startIndex);

    final boolean closesImmediately = isVoidTag(tagName) || startTag.selfClosing;
    if (closesImmediately) {
        // check if we have content to append as we must close this tag here
        processVoidTag(output, startTag);
        tag.end = output.length();
    }

    // only void/self-closing tags can have equal start and end here:
    // they produced no content -> ignore
    if (tag.start == tag.end) {
        return;
    }

    inlineTags.add(tag);
}
/**
 * Closes the open inline tag returned by {@link #findOpenInlineTag(String)} for this end
 * tag at the current length of {@code output}. An end tag without an open counterpart is
 * ignored.
 *
 * @param output target the tag indices refer to
 * @param endTag end tag token
 */
protected <T extends Appendable & CharSequence> void processInlineTagEnd(
        @NonNull T output,
        @NonNull Token.EndTag endTag) {

    final InlineImpl open = findOpenInlineTag(endTag.normalName);
    if (open == null) {
        // nothing to close
        return;
    }

    open.end = output.length();
}
/**
 * Opens a new block as a child of the current block and makes it the current block.
 * <p>
 * Before that, an open paragraph is always closed (a new line is appended first), and an
 * open list item is closed when the new tag is a list item too. Tags listed in
 * {@code BLOCK_TAGS} additionally get a new line ensured before them.
 *
 * @param output   target the tag indices refer to
 * @param startTag start tag token
 */
protected <T extends Appendable & CharSequence> void processBlockTagStart(
        @NonNull T output,
        @NonNull Token.StartTag startTag) {

    final String name = startTag.normalName;

    // block tags (all that are NOT inline -> blocks
    // I think there is only one strong rule -> paragraph cannot contain anything
    // except inline tags
    // also, closing paragraph with non-closed inlines -> doesn't close inlines
    // they are continued for _afterwards_
    if (TAG_PARAGRAPH.equals(currentBlock.name)) {
        // it must be closed here not matter what we are as here we _assume_
        // that it's a block tag
        append(output, "\n");
        // close through closeAt (not by writing `end` directly) so the child list
        // of the closed block is finalized as well
        currentBlock.closeAt(output.length());
        currentBlock = currentBlock.parent;
    } else if (TAG_LIST_ITEM.equals(name)
            && TAG_LIST_ITEM.equals(currentBlock.name)) {
        // close previous list item if in the same parent
        currentBlock.closeAt(output.length());
        currentBlock = currentBlock.parent;
    }

    if (isBlockTag(name)) {
        ensureNewLine(output);
    }

    final int start = output.length();
    final BlockImpl block = BlockImpl.create(name, start, currentBlock);

    //noinspection ConstantConditions
    appendBlockChild(block.parent, block);

    this.currentBlock = block;
}
/**
 * Closes the block returned by {@link #findOpenBlockTag(String)} for this end tag at the
 * current length of {@code output} and makes its parent the current block. A new line is
 * appended before closing a paragraph. An end tag without a matching block is ignored.
 *
 * @param output target the tag indices refer to
 * @param endTag end tag token
 */
protected <T extends Appendable & CharSequence> void processBlockTagEnd(
        @NonNull T output,
        @NonNull Token.EndTag endTag) {

    final String tagName = endTag.normalName;

    final BlockImpl match = findOpenBlockTag(tagName);
    if (match == null) {
        return;
    }

    if (TAG_PARAGRAPH.equals(tagName)) {
        append(output, "\n");
    }

    match.closeAt(output.length());
    this.currentBlock = match.parent;
}
/**
 * Appends the textual content of a void/self-closing tag: a new line for {@code br}, and
 * for {@code img} its {@code alt} attribute or {@code IMG_REPLACEMENT} when {@code alt} is
 * missing or empty. Nothing is appended for any other tag.
 *
 * @param output   target to append to
 * @param startTag start tag token
 */
protected <T extends Appendable & CharSequence> void processVoidTag(
        @NonNull T output,
        @NonNull Token.StartTag startTag) {

    final String tagName = startTag.normalName;

    if ("br".equals(tagName)) {
        append(output, "\n");
        return;
    }

    if (!"img".equals(tagName)) {
        // other tags are ignored
        return;
    }

    final String alt = startTag.attributes.getIgnoreCase("alt");
    final boolean hasAlt = alt != null && alt.length() != 0;

    // no alt is provided -> use the replacement
    append(output, hasAlt ? alt : IMG_REPLACEMENT);
}
/**
 * Appends the data of a character token to {@code output}.
 *
 * @param output    target to append to
 * @param character character token
 */
protected <T extends Appendable & CharSequence> void processCharacter(
        @NonNull T output,
        @NonNull Token.Character character) {

    // Open question kept from the original implementation: text inside tags such as
    // BUTTON, INPUT, SELECT, SCRIPT, TEXTAREA arguably should not be appended. The decision
    // has to be made here, because dropping or changing text later would shift all indexes.
    // For now the inline context is ignored and the text is always appended.
    final CharSequence data = character.getData();
    append(output, data);
}
/**
 * Adds {@code child} to the child list of {@code parent}, creating that list on first use.
 *
 * @param parent block that receives the child
 * @param child  block to add
 */
protected void appendBlockChild(@NonNull BlockImpl parent, @NonNull BlockImpl child) {
    if (parent.children == null) {
        parent.children = new ArrayList<>(2);
    }
    parent.children.add(child);
}
/**
 * Searches the pending inline tags from the most recently added one backwards.
 *
 * @param name tag name to look for
 * @return the last added tag with this name whose end is still negative, or {@code null}
 */
@Nullable
protected InlineImpl findOpenInlineTag(@NonNull String name) {
    int index = inlineTags.size();
    while (--index >= 0) {
        final InlineImpl candidate = inlineTags.get(index);
        if (candidate.end < 0
                && name.equals(candidate.name)) {
            return candidate;
        }
    }
    return null;
}
/**
 * Walks from the current block up through its parents.
 *
 * @param name tag name to look for
 * @return the first block on that path with this name, or {@code null} if there is none
 */
@Nullable
protected BlockImpl findOpenBlockTag(@NonNull String name) {
    for (BlockImpl block = currentBlock; block != null; block = block.parent) {
        if (name.equals(block.name)) {
            return block;
        }
    }
    return null;
}
/**
* @param name tag name; the lookup is an exact match against lower-case names
* @return {@code true} if {@code name} is contained in {@code INLINE_TAGS}
*/
// name here must be lower case
protected static boolean isInlineTag(@NonNull String name) {
return INLINE_TAGS.contains(name);
}
/**
* @param name tag name; the lookup is an exact match against lower-case names
* @return {@code true} if {@code name} is contained in {@code VOID_TAGS}
*/
protected static boolean isVoidTag(@NonNull String name) {
return VOID_TAGS.contains(name);
}
/**
* @param name tag name; the lookup is an exact match against lower-case names
* @return {@code true} if {@code name} is contained in {@code BLOCK_TAGS}
*/
protected static boolean isBlockTag(@NonNull String name) {
return BLOCK_TAGS.contains(name);
}
/**
 * Appends {@code text} to {@code appendable}, converting the checked {@link IOException}
 * into a {@link RuntimeException} that keeps the original exception as its cause.
 *
 * @param appendable target to append to
 * @param text       text to append
 */
protected static void append(@NonNull Appendable appendable, @NonNull CharSequence text) {
    try {
        appendable.append(text);
    } catch (IOException cause) {
        // _must_ not happen
        throw new RuntimeException(cause);
    }
}
/**
 * Appends a new line unless {@code output} is empty or already ends with one.
 *
 * @param output target to inspect and append to
 */
protected static <T extends Appendable & CharSequence> void ensureNewLine(@NonNull T output) {
    final int length = output.length();
    if (length == 0) {
        return;
    }
    final char last = output.charAt(length - 1);
    if (last != '\n') {
        append(output, "\n");
    }
}
}

View File

@ -0,0 +1,13 @@
package ru.noties.markwon.html.jsoup;
import java.io.IOException;
/**
 * Unchecked wrapper that carries an {@link IOException} as its cause.
 */
public class UncheckedIOException extends RuntimeException {

    /**
     * @param cause the exception to wrap
     */
    public UncheckedIOException(IOException cause) {
        super(cause);
    }

    /**
     * @return the wrapped exception, i.e. the cause of this exception
     */
    public IOException ioException() {
        final Throwable cause = getCause();
        return (IOException) cause;
    }
}

View File

@ -0,0 +1,18 @@
package ru.noties.markwon.html.jsoup.helper;
import java.util.Locale;
/**
 * Util methods for normalizing strings. Jsoup internal use only, please don't depend on this API.
 */
public final class Normalizer {

    /**
     * @param input string to convert, may be {@code null}
     * @return {@code input} lower-cased with {@link Locale#ENGLISH}, or an empty string for
     * {@code null}
     */
    public static String lowerCase(final String input) {
        if (input == null) {
            return "";
        }
        return input.toLowerCase(Locale.ENGLISH);
    }

    /**
     * @param input string to convert, may be {@code null}
     * @return result of {@link #lowerCase(String)} with leading and trailing whitespace trimmed
     */
    public static String normalize(final String input) {
        final String lowered = lowerCase(input);
        return lowered.trim();
    }
}

View File

@ -0,0 +1,112 @@
package ru.noties.markwon.html.jsoup.helper;
/**
 * Simple validation methods. Designed for jsoup internal use.
 * Every failed check throws an {@link IllegalArgumentException}.
 */
public final class Validate {

    private Validate() {}

    /**
     * Validates that the object is not null
     * @param obj object to test
     */
    public static void notNull(Object obj) {
        notNull(obj, "Object must not be null");
    }

    /**
     * Validates that the object is not null
     * @param obj object to test
     * @param msg message to output if validation fails
     */
    public static void notNull(Object obj, String msg) {
        if (obj == null) {
            fail(msg);
        }
    }

    /**
     * Validates that the value is true
     * @param val value to test
     */
    public static void isTrue(boolean val) {
        isTrue(val, "Must be true");
    }

    /**
     * Validates that the value is true
     * @param val value to test
     * @param msg message to output if validation fails
     */
    public static void isTrue(boolean val, String msg) {
        if (!val) {
            fail(msg);
        }
    }

    /**
     * Validates that the value is false
     * @param val value to test
     */
    public static void isFalse(boolean val) {
        isFalse(val, "Must be false");
    }

    /**
     * Validates that the value is false
     * @param val value to test
     * @param msg message to output if validation fails
     */
    public static void isFalse(boolean val, String msg) {
        if (val) {
            fail(msg);
        }
    }

    /**
     * Validates that the array contains no null elements
     * @param objects the array to test
     */
    public static void noNullElements(Object[] objects) {
        noNullElements(objects, "Array must not contain any null objects");
    }

    /**
     * Validates that the array contains no null elements
     * @param objects the array to test
     * @param msg message to output if validation fails
     */
    public static void noNullElements(Object[] objects, String msg) {
        for (int i = 0; i < objects.length; i++) {
            if (objects[i] == null) {
                fail(msg);
            }
        }
    }

    /**
     * Validates that the string is not empty
     * @param string the string to test
     */
    public static void notEmpty(String string) {
        notEmpty(string, "String must not be empty");
    }

    /**
     * Validates that the string is not empty
     * @param string the string to test
     * @param msg message to output if validation fails
     */
    public static void notEmpty(String string, String msg) {
        final boolean empty = string == null || string.length() == 0;
        if (empty) {
            fail(msg);
        }
    }

    /**
     * Cause a failure.
     * @param msg message to output.
     */
    public static void fail(String msg) {
        throw new IllegalArgumentException(msg);
    }
}

View File

@ -0,0 +1,202 @@
package ru.noties.markwon.html.jsoup.nodes;
import java.util.Map;
import ru.noties.markwon.html.jsoup.helper.Validate;
/**
A single key + value attribute. (Only used for presentation.)
*/
public class Attribute implements Map.Entry<String, String>, Cloneable {
// private static final String[] booleanAttributes = {
// "allowfullscreen", "async", "autofocus", "checked", "compact", "declare", "default", "defer", "disabled",
// "formnovalidate", "hidden", "inert", "ismap", "itemscope", "multiple", "muted", "nohref", "noresize",
// "noshade", "novalidate", "nowrap", "open", "readonly", "required", "reversed", "seamless", "selected",
// "sortable", "truespeed", "typemustmatch"
// };
private String key;
private String val;
Attributes parent; // used to update the holding Attributes when the key / value is changed via this interface
/**
* Create a new attribute from unencoded (raw) key and value. The attribute has no parent
* {@code Attributes}; this constructor delegates to the three-argument one with {@code null}.
* @param key attribute key; case is preserved.
* @param value attribute value
*/
public Attribute(String key, String value) {
this(key, value, null);
}
/**
* Create a new attribute from unencoded (raw) key and value.
* @param key attribute key; case is preserved.
* @param val attribute value
* @param parent the containing Attributes (this Attribute is not automatically added to said Attributes)
*/
public Attribute(String key, String val, Attributes parent) {
Validate.notNull(key);
this.key = key.trim();
Validate.notEmpty(key); // trimming could potentially make empty, so validate here
this.val = val;
this.parent = parent;
}
/**
Get the attribute key.
@return the attribute key, as stored by the constructor or {@link #setKey(String)}
*/
public String getKey() {
return key;
}
/**
 * Set the attribute key; case is preserved, surrounding whitespace is trimmed. When this
 * attribute has a parent that contains the old key, the key stored in the parent is
 * replaced as well.
 * @param key the new key; must not be null and must not be empty after trimming
 */
public void setKey(String key) {
    Validate.notNull(key);
    final String trimmed = key.trim();
    Validate.notEmpty(trimmed); // trimming could potentially make empty, so validate here

    if (parent != null) {
        final int index = parent.indexOfKey(this.key);
        if (index != Attributes.NotFound) {
            parent.keys[index] = trimmed;
        }
    }

    this.key = trimmed;
}
/**
Get the attribute value.
@return the attribute value as stored in this object; may be null
*/
public String getValue() {
return val;
}
/**
 * Set the attribute value. When this attribute has a parent that contains its key, the
 * value stored in the parent is replaced as well.
 * @param val the new attribute value (not validated here)
 * @return the previous value; taken from the parent when there is one, otherwise from this
 * attribute. A null previous value is reported as an empty string.
 */
public String setValue(String val) {
    // start from our own value so that an attribute without a parent does not
    // dereference a null `parent`
    String oldVal = this.val;
    if (parent != null) {
        oldVal = parent.get(this.key);
        int i = parent.indexOfKey(this.key);
        if (i != Attributes.NotFound) {
            parent.vals[i] = val;
        }
    }
    this.val = val;
    return Attributes.checkNotNull(oldVal);
}
// /**
// Get the HTML representation of this attribute; e.g. {@code href="index.html"}.
// @return HTML
// */
// public String html() {
// StringBuilder accum = new StringBuilder();
//
// try {
// html(accum, (new Document("")).outputSettings());
// } catch(IOException exception) {
// throw new SerializationException(exception);
// }
// return accum.toString();
// }
//
// protected static void html(String key, String val, Appendable accum, Document.OutputSettings out) throws IOException {
// accum.append(key);
// if (!shouldCollapseAttribute(key, val, out)) {
// accum.append("=\"");
// Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false);
// accum.append('"');
// }
// }
//
// protected void html(Appendable accum, Document.OutputSettings out) throws IOException {
// html(key, val, accum, out);
// }
// /**
// Get the string representation of this attribute, implemented as {@link #html()}.
// @return string
// */
// @Override
// public String toString() {
// return html();
// }
// /**
// * Create a new Attribute from an unencoded key and a HTML attribute encoded value.
// * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars.
// * @param encodedValue HTML attribute encoded value
// * @return attribute
// */
// public static Attribute createFromEncoded(String unencodedKey, String encodedValue) {
// String value = Entities.unescape(encodedValue, true);
// return new Attribute(unencodedKey, value, null); // parent will get set when Put
// }
/**
* @return result of {@link #isDataAttribute(String)} for the key of this attribute
*/
protected boolean isDataAttribute() {
return isDataAttribute(key);
}
/**
 * @param key attribute key to test
 * @return {@code true} if {@code key} starts with {@code Attributes.dataPrefix} and is
 * longer than that prefix
 */
protected static boolean isDataAttribute(String key) {
    final String prefix = Attributes.dataPrefix;
    if (key.length() <= prefix.length()) {
        return false;
    }
    return key.startsWith(prefix);
}
// /**
// * Collapsible if it's a boolean attribute and value is empty or same as name
// *
// * @param out output settings
// * @return Returns whether collapsible or not
// */
// protected final boolean shouldCollapseAttribute(Document.OutputSettings out) {
// return shouldCollapseAttribute(key, val, out);
// }
// protected static boolean shouldCollapseAttribute(final String key, final String val, final Document.OutputSettings out) {
// return (
// out.syntax() == Document.OutputSettings.Syntax.html &&
// (val == null || ("".equals(val) || val.equalsIgnoreCase(key)) && Attribute.isBooleanAttribute(key)));
// }
// /**
// * @deprecated
// */
// protected boolean isBooleanAttribute() {
// return Arrays.binarySearch(booleanAttributes, key) >= 0 || val == null;
// }
//
// /**
// * Checks if this attribute name is defined as a boolean attribute in HTML5
// */
// protected static boolean isBooleanAttribute(final String key) {
// return Arrays.binarySearch(booleanAttributes, key) >= 0;
// }
/**
 * Two attributes are equal when they are of the same class and have equal keys and equal
 * values; the parent is not considered.
 */
@Override
public boolean equals(Object o) { // note parent not considered
    if (o == this) {
        return true;
    }
    if (o == null || o.getClass() != getClass()) {
        return false;
    }

    final Attribute other = (Attribute) o;

    final boolean sameKey = key == null ? other.key == null : key.equals(other.key);
    if (!sameKey) {
        return false;
    }

    return val == null ? other.val == null : val.equals(other.val);
}
/**
 * Hash code built from key and value ({@code 31 * keyHash + valueHash}, a null part counts
 * as 0); the parent is not considered.
 */
@Override
public int hashCode() { // note parent not considered
    final int keyHash = key == null ? 0 : key.hashCode();
    final int valHash = val == null ? 0 : val.hashCode();
    return 31 * keyHash + valHash;
}
/**
 * @return copy of this attribute created by {@link Object#clone()}
 * @throws RuntimeException wrapping {@link CloneNotSupportedException} if cloning fails
 */
@Override
public Attribute clone() {
    final Object copy;
    try {
        copy = super.clone();
    } catch (CloneNotSupportedException e) {
        throw new RuntimeException(e);
    }
    return (Attribute) copy;
}
}

View File

@ -0,0 +1,444 @@
package ru.noties.markwon.html.jsoup.nodes;
import java.util.AbstractMap;
import java.util.AbstractSet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import ru.noties.markwon.html.jsoup.helper.Validate;
import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase;
/**
* The attributes of an Element.
* <p>
* Attributes are treated as a map: there can be only one value associated with an attribute key/name.
* </p>
* <p>
* Attribute name and value comparisons are generally <b>case sensitive</b>. By default for HTML, attribute names are
* normalized to lower-case on parsing. That means you should use lower-case strings when referring to attributes by
* name.
* </p>
*
* @author Jonathan Hedley, jonathan@hedley.net
*/
public class Attributes implements Iterable<Attribute>, Cloneable {
// key prefix that marks an attribute as a data attribute
protected static final String dataPrefix = "data-";
private static final int InitialCapacity = 4; // todo - analyze Alexa 1MM sites, determine best setting
// manages the key/val arrays
private static final int GrowthFactor = 2;
private static final String[] Empty = {};
// returned by the indexOf* methods when no key matches
static final int NotFound = -1;
private static final String EmptyString = "";
private int size = 0; // number of slots used (not capacity, which is keys.length)
// parallel arrays: vals[i] is the value that belongs to keys[i]
String[] keys = Empty;
String[] vals = Empty;
/**
 * Makes sure the key/value arrays can hold at least {@code minNewSize} entries, replacing
 * them with larger copies when needed.
 *
 * @param minNewSize required capacity; must not be smaller than the current size
 */
private void checkCapacity(int minNewSize) {
    Validate.isTrue(minNewSize >= size);

    final int capacity = keys.length;
    if (minNewSize <= capacity) {
        // there is room already
        return;
    }

    final int grown = capacity >= InitialCapacity ? size * GrowthFactor : InitialCapacity;
    final int target = Math.max(grown, minNewSize);

    keys = copyOf(keys, target);
    vals = copyOf(vals, target);
}
/**
 * Simple implementation of Arrays.copy, for support of Android API 8.
 *
 * @param orig array to copy from
 * @param size length of the returned array
 * @return new array of length {@code size} holding the leading elements of {@code orig}
 */
private static String[] copyOf(String[] orig, int size) {
    final String[] result = new String[size];
    final int count = Math.min(orig.length, size);
    System.arraycopy(orig, 0, result, 0, count);
    return result;
}
/**
 * @param key key to look for (case-sensitive); must not be null
 * @return index of the first used slot holding {@code key}, or {@code NotFound}
 */
int indexOfKey(String key) {
    Validate.notNull(key);
    int index = 0;
    while (index < size) {
        if (key.equals(keys[index])) {
            return index;
        }
        index++;
    }
    return NotFound;
}
/**
 * @param key key to look for (case-insensitive); must not be null
 * @return index of the first used slot whose key matches, or {@code NotFound}
 */
private int indexOfKeyIgnoreCase(String key) {
    Validate.notNull(key);
    int index = 0;
    while (index < size) {
        if (key.equalsIgnoreCase(keys[index])) {
            return index;
        }
        index++;
    }
    return NotFound;
}
/**
 * Boolean attributes are tracked as null values (they are just keys), so consumers get an
 * empty string for them.
 *
 * @param val stored value, may be null
 * @return {@code val}, or an empty string when {@code val} is null
 */
static String checkNotNull(String val) {
    if (val == null) {
        return EmptyString;
    }
    return val;
}
/**
 * Get an attribute value by key.
 * @param key the (case-sensitive) attribute key
 * @return the attribute value if set; or empty string if not set (or a boolean attribute).
 * @see #hasKey(String)
 */
public String get(String key) {
    final int index = indexOfKey(key);
    if (index == NotFound) {
        return EmptyString;
    }
    return checkNotNull(vals[index]);
}
/**
 * Get an attribute's value by case-insensitive key
 * @param key the attribute name
 * @return the first matching attribute value if set; or empty string if not set (or a boolean attribute).
 */
public String getIgnoreCase(String key) {
    final int index = indexOfKeyIgnoreCase(key);
    if (index == NotFound) {
        return EmptyString;
    }
    return checkNotNull(vals[index]);
}
/**
 * Appends a key/value pair without checking if this key exists.
 *
 * @param key   key to store
 * @param value value to store
 */
private void add(String key, String value) {
    final int slot = size;
    checkCapacity(slot + 1);
    keys[slot] = key;
    vals[slot] = value;
    size = slot + 1;
}
/**
 * Set a new attribute, or replace an existing one by key.
 * @param key case sensitive attribute key
 * @param value attribute value
 * @return these attributes, for chaining
 */
public Attributes put(String key, String value) {
    final int index = indexOfKey(key);
    if (index == NotFound) {
        add(key, value);
    } else {
        vals[index] = value;
    }
    return this;
}
/**
 * Sets a value by case-insensitive key. An existing entry gets the new value and, if its
 * key differs in case, the new key; otherwise a new entry is added.
 *
 * @param key   attribute key
 * @param value attribute value
 */
void putIgnoreCase(String key, String value) {
    final int index = indexOfKeyIgnoreCase(key);
    if (index == NotFound) {
        add(key, value);
        return;
    }

    vals[index] = value;
    if (!keys[index].equals(key)) { // case changed, update
        keys[index] = key;
    }
}
/**
 * Set a new boolean attribute, remove attribute if value is false.
 * <p>
 * NOTE(review): setting ({@code true}) matches the key case-insensitively, while removing
 * ({@code false}) goes through the case-sensitive {@link #remove(String)}.
 * @param key attribute key
 * @param value {@code true} to set the attribute (stored with a null value),
 * {@code false} to remove it
 * @return these attributes, for chaining
 */
public Attributes put(String key, boolean value) {
    if (!value) {
        remove(key);
        return this;
    }
    putIgnoreCase(key, null);
    return this;
}
/**
 * Set a new attribute, or replace an existing one by key. The attribute's parent is set to
 * these attributes afterwards.
 * @param attribute attribute with case sensitive key
 * @return these attributes, for chaining
 */
public Attributes put(Attribute attribute) {
    Validate.notNull(attribute);

    final String key = attribute.getKey();
    final String value = attribute.getValue();
    put(key, value);

    attribute.parent = this;
    return this;
}
/**
 * Removes the entry at {@code index} and shifts the following entries up by one.
 *
 * @param index slot to remove; must be smaller than the current size
 */
private void remove(int index) {
    Validate.isFalse(index >= size);

    final int tail = size - index - 1;
    if (tail > 0) {
        System.arraycopy(keys, index + 1, keys, index, tail);
        System.arraycopy(vals, index + 1, vals, index, tail);
    }

    final int last = --size;
    keys[last] = null; // release hold
    vals[last] = null;
}
/**
 * Remove an attribute by key. <b>Case sensitive.</b> Does nothing when the key is not present.
 * @param key attribute key to remove
 */
public void remove(String key) {
    final int index = indexOfKey(key);
    if (index == NotFound) {
        return;
    }
    remove(index);
}
/**
 * Remove an attribute by key. <b>Case insensitive.</b> Does nothing when the key is not present.
 * @param key attribute key to remove
 */
public void removeIgnoreCase(String key) {
    final int index = indexOfKeyIgnoreCase(key);
    if (index == NotFound) {
        return;
    }
    remove(index);
}
/**
 * Tests if these attributes contain an attribute with this key.
 * @param key case-sensitive key to check for
 * @return true if key exists, false otherwise
 */
public boolean hasKey(String key) {
    final int index = indexOfKey(key);
    return NotFound != index;
}
/**
 * Tests if these attributes contain an attribute with this key.
 * @param key case-insensitive key to check for
 * @return true if key exists, false otherwise
 */
public boolean hasKeyIgnoreCase(String key) {
    final int index = indexOfKeyIgnoreCase(key);
    return NotFound != index;
}
/**
Get the number of attributes in this set.
@return number of used slots (not the capacity of the backing arrays)
*/
public int size() {
return size;
}
/**
 * Add all the attributes from the incoming set to this set. Each attribute goes through
 * {@link #put(Attribute)}, so existing keys are replaced (case-sensitive).
 * @param incoming attributes to add to these attributes.
 */
public void addAll(Attributes incoming) {
    if (incoming.size() == 0) {
        return;
    }

    // reserve room for the worst case (no key overlaps) up front
    checkCapacity(size + incoming.size);

    final Iterator<Attribute> attributes = incoming.iterator();
    while (attributes.hasNext()) {
        // todo - should this be case insensitive?
        put(attributes.next());
    }
}
/**
 * Iterates over the attributes in slot order. Every call to {@code next()} creates a new
 * {@link Attribute} whose parent is this object; {@code remove()} removes the attribute
 * returned by the preceding {@code next()} call.
 *
 * @return iterator over the attributes
 */
@Override
public Iterator<Attribute> iterator() {
    return new Iterator<Attribute>() {

        // index of the slot that the following next() call returns
        int i = 0;

        @Override
        public boolean hasNext() {
            return i < size;
        }

        @Override
        public Attribute next() {
            if (i >= size) {
                // Iterator contract: signal exhaustion instead of reading an unused slot
                throw new java.util.NoSuchElementException();
            }
            final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this);
            i++;
            return attr;
        }

        @Override
        public void remove() {
            if (i <= 0) {
                // next() was not called yet: there is nothing to remove, and rewinding
                // would address slot -1
                throw new IllegalStateException("next() must be called before remove()");
            }
            Attributes.this.remove(--i); // next() advanced, so rewind
        }
    };
}
/**
 * Get the attributes as a List, for iteration.
 * @return an unmodifiable List holding a new {@link Attribute} (with these attributes as
 * parent) for every entry.
 */
public List<Attribute> asList() {
    final List<Attribute> result = new ArrayList<>(size);
    int index = 0;
    while (index < size) {
        // every entry, boolean (null value) or not, is represented by a plain Attribute
        result.add(new Attribute(keys[index], vals[index], this));
        index++;
    }
    return Collections.unmodifiableList(result);
}
/**
* Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys
* starting with {@code data-}.
* @return map of custom data attributes; a new {@code Dataset} created from these attributes on every call.
*/
public Map<String, String> dataset() {
return new Dataset(this);
}
// /**
// Get the HTML representation of these attributes.
// @return HTML
// @throws SerializationException if the HTML representation of the attributes cannot be constructed.
// */
// public String html() {
// StringBuilder accum = new StringBuilder();
// try {
// html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used
// } catch (IOException e) { // ought never happen
// throw new SerializationException(e);
// }
// return accum.toString();
// }
//
// final void html(final Appendable accum, final Document.OutputSettings out) throws IOException {
// final int sz = size;
// for (int i = 0; i < sz; i++) {
// // inlined from Attribute.html()
// final String key = keys[i];
// final String val = vals[i];
// accum.append(' ').append(key);
//
// // collapse checked=null, checked="", checked=checked; write out others
// if (!Attribute.shouldCollapseAttribute(key, val, out)) {
// accum.append("=\"");
// Entities.escape(accum, val == null ? EmptyString : val, out, true, false, false);
// accum.append('"');
// }
// }
// }
//
// @Override
// public String toString() {
// return html();
// }
/**
 * Checks if these attributes are equal to another set of attributes, by comparing the two sets
 * index by index (so the order of attributes matters).
 * <p>
 * Only the first {@code size} slots of the backing arrays are compared. The arrays may carry unused
 * spare capacity, and comparing them whole would make two sets with identical content unequal just
 * because their backing arrays have different lengths.
 * @param o attributes to compare with
 * @return if both sets of attributes have the same content
 */
@Override
public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    final Attributes that = (Attributes) o;
    if (size != that.size) return false;
    for (int i = 0; i < size; i++) {
        final String key = keys[i];
        final String otherKey = that.keys[i];
        if (key == null ? otherKey != null : !key.equals(otherKey)) return false;
        final String val = vals[i];
        final String otherVal = that.vals[i];
        if (val == null ? otherVal != null : !val.equals(otherVal)) return false;
    }
    return true;
}

/**
 * Calculates the hashcode of these attributes from the attribute count and the hashcodes of the stored
 * keys and values. Like {@link #equals(Object)}, it only looks at the first {@code size} slots, so
 * equal attribute sets always produce equal hashcodes regardless of spare capacity.
 * @return calculated hashcode
 */
@Override
public int hashCode() {
    int result = size;
    for (int i = 0; i < size; i++) {
        result = 31 * result + (keys[i] == null ? 0 : keys[i].hashCode());
    }
    for (int i = 0; i < size; i++) {
        result = 31 * result + (vals[i] == null ? 0 : vals[i].hashCode());
    }
    return result;
}
/**
 * Creates a copy of these attributes that owns its own key and value arrays, so later changes to
 * either object do not affect the other. This object is left untouched.
 * @return the copy
 */
@Override
public Attributes clone() {
    final Attributes clone;
    try {
        clone = (Attributes) super.clone();
    } catch (CloneNotSupportedException e) {
        throw new RuntimeException(e);
    }
    clone.size = size;
    // the copies belong to the clone; assigning them to this object's fields would
    // mutate the receiver as a side effect of cloning
    clone.keys = copyOf(keys, size);
    clone.vals = copyOf(vals, size);
    return clone;
}
/**
 * Internal method. Replaces every stored key with its lower-cased form.
 */
public void normalize() {
    int index = 0;
    while (index < size) {
        final String original = keys[index];
        keys[index] = lowerCase(original);
        index++;
    }
}
/**
 * Map view over the data attributes of an {@link Attributes} instance. Keys are exposed without the
 * data prefix; {@code put} and iterator {@code remove} write straight through to the backing attributes.
 */
private static class Dataset extends AbstractMap<String, String> {
    private final Attributes attributes;

    private Dataset(Attributes attributes) {
        this.attributes = attributes;
    }

    @Override
    public Set<Entry<String, String>> entrySet() {
        return new EntrySet();
    }

    /**
     * Stores the value under the prefixed key in the backing attributes.
     * @return the value previously stored under that key, or null if the key was absent
     */
    @Override
    public String put(String key, String value) {
        String dataKey = dataKey(key);
        String oldValue = attributes.hasKey(dataKey) ? attributes.get(dataKey) : null;
        attributes.put(dataKey, value);
        return oldValue;
    }

    private class EntrySet extends AbstractSet<Map.Entry<String, String>> {
        @Override
        public Iterator<Map.Entry<String, String>> iterator() {
            return new DatasetIterator();
        }

        @Override
        public int size() {
            // the number of data attributes is not tracked anywhere, so count by walking them
            int count = 0;
            final Iterator<Map.Entry<String, String>> iter = new DatasetIterator();
            while (iter.hasNext()) {
                iter.next();
                count++;
            }
            return count;
        }
    }

    /**
     * Walks the backing attributes by index and yields only the data attributes.
     * {@code hasNext()} may be called any number of times without skipping entries, and
     * {@code remove()} removes the entry returned by the latest {@code next()}.
     */
    private class DatasetIterator implements Iterator<Map.Entry<String, String>> {
        private int cursor = 0;        // index of the next attribute to examine
        private int lastReturned = -1; // index of the entry handed out by the latest next(); -1 if none

        @Override
        public boolean hasNext() {
            // only steps over non-data attributes; stops on (without consuming) the next data attribute
            while (cursor < attributes.size()) {
                if (attributeAt(cursor).isDataAttribute()) return true;
                cursor++;
            }
            return false;
        }

        @Override
        public Entry<String, String> next() {
            if (!hasNext())
                throw new java.util.NoSuchElementException();
            final Attribute attr = attributeAt(cursor);
            lastReturned = cursor++;
            return new Attribute(attr.getKey().substring(dataPrefix.length()), attr.getValue());
        }

        @Override
        public void remove() {
            if (lastReturned < 0)
                throw new IllegalStateException();
            attributes.remove(lastReturned);
            cursor--; // everything after the removed slot shifted down by one
            lastReturned = -1;
        }

        private Attribute attributeAt(int index) {
            return new Attribute(attributes.keys[index], attributes.vals[index], attributes);
        }
    }
}
// Builds the attribute key for a dataset key by putting the data prefix in front of it.
private static String dataKey(String key) {
    final StringBuilder prefixed = new StringBuilder();
    prefixed.append(dataPrefix).append(key);
    return prefixed.toString();
}
}

View File

@ -0,0 +1,104 @@
package ru.noties.markwon.html.jsoup.nodes;
/**
 * A {@code <!DOCTYPE>} node.
 * <p>
 * In this port the class only carries the {@code PUBLIC} / {@code SYSTEM} keyword constants. The
 * constructors, attribute handling and HTML output of the original node are kept below as
 * commented-out code, and the class no longer extends {@code LeafNode}.
 */
public class DocumentType /*extends LeafNode*/ {
// todo needs a bit of a chunky cleanup. this level of detail isn't needed
// the two values a doctype's "pubSysKey" can take (see the commented-out PUB_SYS_KEY below)
public static final String PUBLIC_KEY = "PUBLIC";
public static final String SYSTEM_KEY = "SYSTEM";
// private static final String NAME = "name";
// private static final String PUB_SYS_KEY = "pubSysKey"; // PUBLIC or SYSTEM
// private static final String PUBLIC_ID = "publicId";
// private static final String SYSTEM_ID = "systemId";
// todo: quirk mode from publicId and systemId
// /**
// * Create a new doctype element.
// * @param name the doctype's name
// * @param publicId the doctype's public ID
// * @param systemId the doctype's system ID
// */
// public DocumentType(String name, String publicId, String systemId) {
// Validate.notNull(name);
// Validate.notNull(publicId);
// Validate.notNull(systemId);
// attr(NAME, name);
// attr(PUBLIC_ID, publicId);
// if (has(PUBLIC_ID)) {
// attr(PUB_SYS_KEY, PUBLIC_KEY);
// }
// attr(SYSTEM_ID, systemId);
// }
//
// /**
// * Create a new doctype element.
// * @param name the doctype's name
// * @param publicId the doctype's public ID
// * @param systemId the doctype's system ID
// * @param baseUri unused
// * @deprecated
// */
// public DocumentType(String name, String publicId, String systemId, String baseUri) {
// attr(NAME, name);
// attr(PUBLIC_ID, publicId);
// if (has(PUBLIC_ID)) {
// attr(PUB_SYS_KEY, PUBLIC_KEY);
// }
// attr(SYSTEM_ID, systemId);
// }
//
// /**
// * Create a new doctype element.
// * @param name the doctype's name
// * @param publicId the doctype's public ID
// * @param systemId the doctype's system ID
// * @param baseUri unused
// * @deprecated
// */
// public DocumentType(String name, String pubSysKey, String publicId, String systemId, String baseUri) {
// attr(NAME, name);
// if (pubSysKey != null) {
// attr(PUB_SYS_KEY, pubSysKey);
// }
// attr(PUBLIC_ID, publicId);
// attr(SYSTEM_ID, systemId);
// }
// public void setPubSysKey(String value) {
// if (value != null)
// attr(PUB_SYS_KEY, value);
// }
//
// @Override
// public String nodeName() {
// return "#doctype";
// }
//
// @Override
// void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
// if (out.syntax() == Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) {
// // looks like a html5 doctype, go lowercase for aesthetics
// accum.append("<!doctype");
// } else {
// accum.append("<!DOCTYPE");
// }
// if (has(NAME))
// accum.append(" ").append(attr(NAME));
// if (has(PUB_SYS_KEY))
// accum.append(" ").append(attr(PUB_SYS_KEY));
// if (has(PUBLIC_ID))
// accum.append(" \"").append(attr(PUBLIC_ID)).append('"');
// if (has(SYSTEM_ID))
// accum.append(" \"").append(attr(SYSTEM_ID)).append('"');
// accum.append('>');
// }
//
// @Override
// void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {
// }
//
// private boolean has(final String attribute) {
// return !StringUtil.isBlank(attr(attribute));
// }
}

View File

@ -0,0 +1,351 @@
package ru.noties.markwon.html.jsoup.nodes;
import java.nio.charset.CharsetEncoder;
import java.util.Arrays;
import java.util.HashMap;
import ru.noties.markwon.html.jsoup.helper.Validate;
import ru.noties.markwon.html.jsoup.parser.CharacterReader;
import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.base;
import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.extended;
/**
* HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
* HTML named character references</a>.
*/
public class Entities {
private static final int empty = -1;
private static final String emptyName = "";
static final int codepointRadix = 36;
private static final char[] codeDelims = {',', ';'};
private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
// private static final Document.OutputSettings DefaultOutput = new Document.OutputSettings();
public enum EscapeMode {
    /**
     * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
     */
    xhtml(EntitiesData.xmlPoints, 4),
    /**
     * Default HTML output entities.
     */
    base(EntitiesData.basePoints, 106),
    /**
     * Complete HTML entities.
     */
    extended(EntitiesData.fullPoints, 2125);

    // name -> codepoint table; nameKeys is sorted so it can be binary searched. built by BuildEntities.
    private String[] nameKeys;
    private int[] codeVals; // only the first codepoint; the few references with multiple characters also go into multipoints.

    // codepoint -> name table; codeKeys is binary searched as well.
    private int[] codeKeys; // we don't support multiple codepoints mapping to a single named value currently
    private String[] nameVals;

    EscapeMode(String file, int size) {
        load(this, file, size);
    }

    /** Looks up the codepoint of a named entity; yields {@code empty} when the name is unknown. */
    int codepointForName(final String name) {
        final int position = Arrays.binarySearch(nameKeys, name);
        if (position < 0)
            return empty;
        return codeVals[position];
    }

    /** Looks up the entity name of a codepoint; yields {@code emptyName} when there is none. */
    String nameForCodepoint(final int codepoint) {
        final int position = Arrays.binarySearch(codeKeys, codepoint);
        if (position < 0)
            return emptyName;
        // the results are ordered so lower case versions of same codepoint come after uppercase, and we
        // prefer to emit lower (and binary search for same item with multi results is undefined)
        final int following = position + 1;
        final boolean followingIsSameCodepoint = position < nameVals.length - 1 && codeKeys[following] == codepoint;
        return followingIsSameCodepoint ? nameVals[following] : nameVals[position];
    }

    private int size() {
        return nameKeys.length;
    }
}
// Private constructor: this class is never instantiated from outside.
private Entities() {
}
/**
 * Tells whether the given name is a known named entity in the extended (complete) entity set.
 *
 * @param name the possible entity name (e.g. "lt" or "amp")
 * @return true if a known named entity
 */
public static boolean isNamedEntity(final String name) {
    final int codepoint = extended.codepointForName(name);
    return codepoint != empty;
}
/**
 * Tells whether the given name is a known named entity in the base entity set.
 *
 * @param name the possible entity name (e.g. "lt" or "amp")
 * @return true if a known named entity in the base set
 * @see #isNamedEntity(String)
 */
public static boolean isBaseNamedEntity(final String name) {
    final int codepoint = base.codepointForName(name);
    return codepoint != empty;
}
/**
 * Get the Character value of the named entity.
 * <p>
 * The looked-up codepoint is narrowed to a single {@code char}. For an unknown name the lookup yields
 * {@code -1}, so the returned character is {@code '\uFFFF'} rather than null.
 *
 * @param name named entity (e.g. "lt" or "amp")
 * @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}')
 * @deprecated does not support characters outside the BMP or multiple character names;
 * use {@link #getByName(String)} instead
 */
@Deprecated
public static Character getCharacterByName(String name) {
    return (char) extended.codepointForName(name);
}
/**
 * Resolves a named entity to the character(s) it stands for. Names that map to more than one
 * codepoint are answered from the multipoints table; everything else from the extended set.
 *
 * @param name entity (e.g. "lt" or "amp")
 * @return the string value of the character(s) represented by this entity, or "" if not defined
 */
public static String getByName(String name) {
    final String multi = multipoints.get(name);
    if (multi != null)
        return multi;
    final int codepoint = extended.codepointForName(name);
    return codepoint == empty ? emptyName : new String(new int[]{codepoint}, 0, 1);
}
/**
 * Writes the codepoint(s) of a named entity into the supplied array.
 *
 * @param name entity name (e.g. "lt" or "amp")
 * @param codepoints output array; slot 0 (and slot 1 for two-codepoint entities) is overwritten, so it
 *                   must have room for two values
 * @return the number of codepoints written: 2 for a multi-codepoint entity, 1 for a regular one,
 * 0 when the name is unknown (the array is then left untouched)
 */
public static int codepointsForName(final String name, final int[] codepoints) {
    String val = multipoints.get(name);
    if (val != null) {
        final int first = val.codePointAt(0);
        codepoints[0] = first;
        // the second codepoint starts after however many chars the first one occupies
        // (2 for a supplementary codepoint), not necessarily at char index 1
        codepoints[1] = val.codePointAt(Character.charCount(first));
        return 2;
    }
    int codepoint = extended.codepointForName(name);
    if (codepoint != empty) {
        codepoints[0] = codepoint;
        return 1;
    }
    return 0;
}
// /**
// * HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
// *
// * @param string the un-escaped string to escape
// * @param out the output settings to use
// * @return the escaped string
// */
// public static String escape(String string, Document.OutputSettings out) {
// if (string == null)
// return "";
// StringBuilder accum = new StringBuilder(string.length() * 2);
// try {
// escape(accum, string, out, false, false, false);
// } catch (IOException e) {
// throw new SerializationException(e); // doesn't happen
// }
// return accum.toString();
// }
// /**
// * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
// * {@code &lt;}
// *
// * @param string the un-escaped string to escape
// * @return the escaped string
// */
// public static String escape(String string) {
// return escape(string, DefaultOutput);
// }
//
// // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
// static void escape(Appendable accum, String string, Document.OutputSettings out,
// boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
//
// boolean lastWasWhite = false;
// boolean reachedNonWhite = false;
// final EscapeMode escapeMode = out.escapeMode();
// final CharsetEncoder encoder = out.encoder();
// final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
// final int length = string.length();
//
// int codePoint;
// for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
// codePoint = string.codePointAt(offset);
//
// if (normaliseWhite) {
// if (StringUtil.isWhitespace(codePoint)) {
// if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
// continue;
// accum.append(' ');
// lastWasWhite = true;
// continue;
// } else {
// lastWasWhite = false;
// reachedNonWhite = true;
// }
// }
// // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
// if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
// final char c = (char) codePoint;
// // html specific and required escapes:
// switch (c) {
// case '&':
// accum.append("&amp;");
// break;
// case 0xA0:
// if (escapeMode != EscapeMode.xhtml)
// accum.append("&nbsp;");
// else
// accum.append("&#xa0;");
// break;
// case '<':
// // escape when in character data or when in a xml attribue val; not needed in html attr val
// if (!inAttribute || escapeMode == EscapeMode.xhtml)
// accum.append("&lt;");
// else
// accum.append(c);
// break;
// case '>':
// if (!inAttribute)
// accum.append("&gt;");
// else
// accum.append(c);
// break;
// case '"':
// if (inAttribute)
// accum.append("&quot;");
// else
// accum.append(c);
// break;
// default:
// if (canEncode(coreCharset, c, encoder))
// accum.append(c);
// else
// appendEncoded(accum, escapeMode, codePoint);
// }
// } else {
// final String c = new String(Character.toChars(codePoint));
// if (encoder.canEncode(c)) // uses fallback encoder for simplicity
// accum.append(c);
// else
// appendEncoded(accum, escapeMode, codePoint);
// }
// }
// }
// private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
// final String name = escapeMode.nameForCodepoint(codePoint);
// if (name != emptyName) // ok for identity check
// accum.append('&').append(name).append(';');
// else
// accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
// }
// /**
// * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
// *
// * @param string the HTML string to un-escape
// * @return the unescaped string
// */
// public static String unescape(String string) {
// return unescape(string, false);
// }
// /**
// * Unescape the input string.
// *
// * @param string to un-HTML-escape
// * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
// * @return unescaped string
// */
// static String unescape(String string, boolean strict) {
// return Parser.unescapeEntities(string, strict);
// }
/*
 * Fast-path replacement for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
 * After KitKat, the implementation of canEncode degrades to the point of being useless. Only ASCII and UTF
 * charsets are answered directly; any other charset is delegated to the fallback encoder, where performance
 * may be bad. More encoders for common character sets can be added if they turn out to be impacted on Android.
 *
 * Benchmarks: *
 * OLD toHtml() impl v New (fastpath) in millis
 * Wiki: 1895, 16
 * CNN: 6378, 55
 * Alterslash: 3013, 28
 * Jsoup: 167, 2
 */
private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
    // todo add more charset tests if impacted by Android's bad perf in canEncode
    final boolean encodable;
    switch (charset) {
        case ascii:
            encodable = c < 0x80;
            break;
        case utf:
            // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
            encodable = true;
            break;
        default:
            encodable = fallback.canEncode(c);
            break;
    }
    return encodable;
}
/**
 * Coarse classification of a charset name: plain ASCII, any UTF variant, or everything else.
 */
enum CoreCharset {
    ascii, utf, fallback;

    /**
     * Classifies a charset by its name. The comparison is case-sensitive.
     */
    static CoreCharset byName(final String name) {
        final boolean isAscii = name.equals("US-ASCII");
        if (!isAscii) {
            // the prefix covers UTF-8, UTF-16, et al
            return name.startsWith("UTF-") ? utf : fallback;
        }
        return ascii;
    }
}
/**
 * Fills the lookup tables of an escape mode from its packed data string.
 * <p>
 * Each record has the form {@code name=cp1[,cp2];index&}. The codepoints and the index are parsed with
 * radix {@code codepointRadix}. Records are stored into the name tables in the order they are read; the
 * parsed index says where the record goes in the codepoint tables. Records carrying a second codepoint
 * are additionally registered in {@code multipoints}.
 *
 * @param e the escape mode whose tables are populated
 * @param pointsData the packed entity data
 * @param size the number of records expected; validated once the data has been read
 */
private static void load(EscapeMode e, String pointsData, int size) {
e.nameKeys = new String[size];
e.codeVals = new int[size];
e.codeKeys = new int[size];
e.nameVals = new String[size];
int i = 0;
CharacterReader reader = new CharacterReader(pointsData);
while (!reader.isEmpty()) {
// NotNestedLessLess=10913,824;1887&
final String name = reader.consumeTo('=');
reader.advance(); // step over '='
final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
// the delimiter that ended cp1 tells whether a second codepoint follows (',') or not (';')
final char codeDelim = reader.current();
reader.advance();
final int cp2;
if (codeDelim == ',') {
cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
reader.advance(); // step over ';'
} else {
cp2 = empty;
}
final String indexS = reader.consumeTo('&');
final int index = Integer.parseInt(indexS, codepointRadix);
reader.advance(); // step over '&'
// name tables are filled in read order, codepoint tables at the position given by the record
e.nameKeys[i] = name;
e.codeVals[i] = cp1;
e.codeKeys[index] = cp1;
e.nameVals[index] = name;
if (cp2 != empty) {
multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
}
i++;
}
Validate.isTrue(i == size, "Unexpected count of entities loaded");
}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,483 @@
package ru.noties.markwon.html.jsoup.parser;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Locale;
import ru.noties.markwon.html.jsoup.UncheckedIOException;
import ru.noties.markwon.html.jsoup.helper.Validate;
/**
 * CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
 * <p>
 * Content is pulled from the underlying {@link Reader} into a fixed-size char buffer. Most methods call
 * {@code bufferUp()} first, which refills the buffer once the cursor has passed the buffer's split point.
 */
public final class CharacterReader {
    static final char EOF = (char) -1;
    private static final int maxStringCacheLen = 12;
    static final int maxBufferLen = 1024 * 32; // visible for testing
    private static final int readAheadLimit = (int) (maxBufferLen * 0.75);

    private final char[] charBuf;
    private final Reader reader;
    private int bufLength;     // number of valid chars currently held in charBuf
    private int bufSplitPoint; // once bufPos reaches this point, bufferUp() refills the buffer
    private int bufPos;        // cursor within charBuf
    private int readerPos;     // offset within the whole content of charBuf[0]
    private int bufMark;       // cursor position remembered by mark()
    private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage

    /**
     * @param input source of the content; must be non-null and support {@code mark()}
     * @param sz requested buffer size, capped at {@code maxBufferLen}
     */
    public CharacterReader(Reader input, int sz) {
        Validate.notNull(input);
        Validate.isTrue(input.markSupported());
        reader = input;
        charBuf = new char[sz > maxBufferLen ? maxBufferLen : sz];
        bufferUp();
    }

    public CharacterReader(Reader input) {
        this(input, maxBufferLen);
    }

    public CharacterReader(String input) {
        this(new StringReader(input), input.length());
    }

    /**
     * Refills the buffer when the cursor has reached the split point: the reader is moved forward by the
     * consumed amount, the buffer is read anew from there, and the reader is reset to that position so
     * that the next refill can skip relative to it. Does nothing while the cursor is before the split point.
     */
    private void bufferUp() {
        if (bufPos < bufSplitPoint)
            return;

        try {
            reader.skip(bufPos);
            reader.mark(maxBufferLen);
            final int read = reader.read(charBuf);
            reader.reset();
            if (read != -1) {
                bufLength = read;
                readerPos += bufPos;
                bufPos = 0;
                bufMark = 0;
                bufSplitPoint = bufLength > readAheadLimit ? readAheadLimit : bufLength;
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Gets the current cursor position in the content.
     * @return current position
     */
    public int pos() {
        return readerPos + bufPos;
    }

    /**
     * Tests if all the content has been read.
     * @return true if nothing left to read.
     */
    public boolean isEmpty() {
        bufferUp();
        return bufPos >= bufLength;
    }

    // same test as isEmpty(), for callers that have already buffered up
    private boolean isEmptyNoBufferUp() {
        return bufPos >= bufLength;
    }

    /**
     * Get the char at the current position.
     * @return char, or {@code EOF} when the content is exhausted
     */
    public char current() {
        bufferUp();
        return isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
    }

    /**
     * Reads the char at the current position and moves past it. The cursor is moved even when
     * {@code EOF} is returned.
     */
    char consume() {
        bufferUp();
        char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
        bufPos++;
        return val;
    }

    /** Moves the cursor back by one. */
    void unconsume() {
        bufPos--;
    }

    /**
     * Moves the current position by one.
     */
    public void advance() {
        bufPos++;
    }

    /** Remembers the current cursor position; a buffer refill resets the remembered position to 0. */
    void mark() {
        bufMark = bufPos;
    }

    /** Moves the cursor back to the position remembered by {@link #mark()}. */
    void rewindToMark() {
        bufPos = bufMark;
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input char
     * @param c scan target
     * @return offset between current position and next instance of target. -1 if not found.
     */
    int nextIndexOf(char c) {
        // doesn't handle scanning for surrogates
        bufferUp();
        for (int i = bufPos; i < bufLength; i++) {
            if (c == charBuf[i])
                return i - bufPos;
        }
        return -1;
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input sequence
     *
     * @param seq scan target
     * @return offset between current position and next instance of target. -1 if not found.
     */
    int nextIndexOf(CharSequence seq) {
        bufferUp();
        // doesn't handle scanning for surrogates
        char startChar = seq.charAt(0);
        for (int offset = bufPos; offset < bufLength; offset++) {
            // scan to first instance of startchar:
            if (startChar != charBuf[offset])
                while (++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
            int i = offset + 1;
            int last = i + seq.length() - 1;
            if (offset < bufLength && last <= bufLength) {
                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
                if (i == last) // found full sequence
                    return offset - bufPos;
            }
        }
        return -1;
    }

    /**
     * Reads characters up to the specific char. The delimiter itself is not consumed. When the delimiter
     * is not found, everything that is left in the buffer is consumed.
     * @param c the delimiter
     * @return the chars read
     */
    public String consumeTo(char c) {
        int offset = nextIndexOf(c);
        if (offset != -1) {
            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
            bufPos += offset;
            return consumed;
        } else {
            return consumeToEnd();
        }
    }

    /**
     * Reads characters up to the specific sequence. The sequence itself is not consumed. When the sequence
     * is not found, everything that is left in the buffer is consumed.
     */
    String consumeTo(String seq) {
        int offset = nextIndexOf(seq);
        if (offset != -1) {
            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
            bufPos += offset;
            return consumed;
        } else {
            return consumeToEnd();
        }
    }

    /**
     * Read characters until the first of any delimiters is found.
     * @param chars delimiters to scan for
     * @return characters read up to the matched delimiter.
     */
    public String consumeToAny(final char... chars) {
        bufferUp();
        final int start = bufPos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        OUTER: while (bufPos < remaining) {
            for (char c : chars) {
                if (val[bufPos] == c)
                    break OUTER;
            }
            bufPos++;
        }

        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
    }

    /** Like {@link #consumeToAny(char...)}, but binary searches the delimiters, which therefore must be sorted. */
    String consumeToAnySorted(final char... chars) {
        bufferUp();
        final int start = bufPos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        while (bufPos < remaining) {
            if (Arrays.binarySearch(chars, val[bufPos]) >= 0)
                break;
            bufPos++;
        }

        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
    }

    /** Reads characters until one of {@code &}, {@code <} or the null char is found. */
    String consumeData() {
        // &, <, null
        bufferUp();
        final int start = bufPos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        while (bufPos < remaining) {
            final char c = val[bufPos];
            if (c == '&' || c == '<' || c == TokeniserState.nullChar)
                break;
            bufPos++;
        }

        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
    }

    /** Reads characters until whitespace, {@code /}, {@code >} or the null char is found. */
    String consumeTagName() {
        // '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar
        bufferUp();
        final int start = bufPos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        while (bufPos < remaining) {
            final char c = val[bufPos];
            if (c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == ' ' || c == '/' || c == '>' || c == TokeniserState.nullChar)
                break;
            bufPos++;
        }

        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
    }

    /** Consumes everything from the cursor to the end of the buffered content. */
    String consumeToEnd() {
        bufferUp();
        String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
        bufPos = bufLength;
        return data;
    }

    /** Consumes a run of letters. */
    String consumeLetterSequence() {
        bufferUp();
        int start = bufPos;
        while (bufPos < bufLength) {
            char c = charBuf[bufPos];
            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
                bufPos++;
            else
                break;
        }

        return cacheString(charBuf, stringCache, start, bufPos - start);
    }

    /** Consumes a run of letters followed by a run of ASCII digits. */
    String consumeLetterThenDigitSequence() {
        bufferUp();
        int start = bufPos;
        while (bufPos < bufLength) {
            char c = charBuf[bufPos];
            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
                bufPos++;
            else
                break;
        }
        while (!isEmptyNoBufferUp()) {
            char c = charBuf[bufPos];
            if (c >= '0' && c <= '9')
                bufPos++;
            else
                break;
        }

        return cacheString(charBuf, stringCache, start, bufPos - start);
    }

    /** Consumes a run of hexadecimal digits (0-9, A-F, a-f). */
    String consumeHexSequence() {
        bufferUp();
        int start = bufPos;
        while (bufPos < bufLength) {
            char c = charBuf[bufPos];
            if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
                bufPos++;
            else
                break;
        }
        return cacheString(charBuf, stringCache, start, bufPos - start);
    }

    /** Consumes a run of ASCII digits. */
    String consumeDigitSequence() {
        bufferUp();
        int start = bufPos;
        while (bufPos < bufLength) {
            char c = charBuf[bufPos];
            if (c >= '0' && c <= '9')
                bufPos++;
            else
                break;
        }
        return cacheString(charBuf, stringCache, start, bufPos - start);
    }

    boolean matches(char c) {
        return !isEmpty() && charBuf[bufPos] == c;
    }

    /** Tests whether the content at the cursor starts with the given sequence, without consuming it. */
    boolean matches(String seq) {
        bufferUp();
        int scanLength = seq.length();
        if (scanLength > bufLength - bufPos)
            return false;

        for (int offset = 0; offset < scanLength; offset++)
            if (seq.charAt(offset) != charBuf[bufPos + offset])
                return false;
        return true;
    }

    /** Same as {@link #matches(String)}, but compares the upper-cased form of each char. */
    boolean matchesIgnoreCase(String seq) {
        bufferUp();
        int scanLength = seq.length();
        if (scanLength > bufLength - bufPos)
            return false;

        for (int offset = 0; offset < scanLength; offset++) {
            char upScan = Character.toUpperCase(seq.charAt(offset));
            char upTarget = Character.toUpperCase(charBuf[bufPos + offset]);
            if (upScan != upTarget)
                return false;
        }
        return true;
    }

    boolean matchesAny(char... seq) {
        // isEmpty() has already buffered up
        if (isEmpty())
            return false;

        char c = charBuf[bufPos];
        for (char seek : seq) {
            if (seek == c)
                return true;
        }
        return false;
    }

    /** Like {@link #matchesAny(char...)}, but binary searches the candidates, which therefore must be sorted. */
    boolean matchesAnySorted(char[] seq) {
        // isEmpty() buffers up
        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
    }

    boolean matchesLetter() {
        if (isEmpty())
            return false;
        char c = charBuf[bufPos];
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
    }

    boolean matchesDigit() {
        if (isEmpty())
            return false;
        char c = charBuf[bufPos];
        return (c >= '0' && c <= '9');
    }

    /** Consumes the sequence if the content at the cursor starts with it; otherwise leaves the cursor alone. */
    boolean matchConsume(String seq) {
        // matches() buffers up
        if (matches(seq)) {
            bufPos += seq.length();
            return true;
        } else {
            return false;
        }
    }

    /** Case-insensitive variant of {@link #matchConsume(String)}. */
    boolean matchConsumeIgnoreCase(String seq) {
        if (matchesIgnoreCase(seq)) {
            bufPos += seq.length();
            return true;
        } else {
            return false;
        }
    }

    boolean containsIgnoreCase(String seq) {
        // used to check presence of </title>, </style>. only finds consistent case.
        String loScan = seq.toLowerCase(Locale.ENGLISH);
        String hiScan = seq.toUpperCase(Locale.ENGLISH);
        return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1);
    }

    /**
     * @return the buffered content that has not been consumed yet; empty when the cursor is at or
     * beyond the end of the buffer
     */
    @Override
    public String toString() {
        // advance() and consume() move the cursor unconditionally, so it can sit past the buffer end;
        // a negative count would make the String constructor throw
        final int remaining = bufLength - bufPos;
        return remaining > 0 ? new String(charBuf, bufPos, remaining) : "";
    }

    /**
     * Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks.
     * <p>
     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
     * some more duplicates.
     */
    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
        // limit (no cache):
        if (count > maxStringCacheLen)
            return new String(charBuf, start, count);
        if (count < 1)
            return "";

        // calculate hash:
        int hash = 0;
        int offset = start;
        for (int i = 0; i < count; i++) {
            hash = 31 * hash + charBuf[offset++];
        }

        // get from cache
        final int index = hash & (stringCache.length - 1);
        String cached = stringCache[index];

        if (cached == null) { // miss, add
            cached = new String(charBuf, start, count);
            stringCache[index] = cached;
        } else { // hashcode hit, check equality
            if (rangeEquals(charBuf, start, count, cached)) { // hit
                return cached;
            } else { // hashcode conflict
                cached = new String(charBuf, start, count);
                stringCache[index] = cached; // update the cache, as recently used strings are more likely to show up again
            }
        }
        return cached;
    }

    /**
     * Check if the value of the provided range equals the string.
     */
    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
        if (count == cached.length()) {
            int i = start;
            int j = 0;
            while (count-- != 0) {
                if (charBuf[i++] != cached.charAt(j++))
                    return false;
            }
            return true;
        }
        return false;
    }

    // just used for testing
    boolean rangeEquals(final int start, final int count, final String cached) {
        return rangeEquals(charBuf, start, count, cached);
    }
}

View File

@ -0,0 +1,41 @@
package ru.noties.markwon.html.jsoup.parser;
/**
 * Describes one error found in the input HTML, during either tokenisation or tree building:
 * where it happened and what went wrong.
 */
public class ParseError {
    private final int pos;
    private final String errorMsg;

    ParseError(int pos, String errorMsg) {
        this.errorMsg = errorMsg;
        this.pos = pos;
    }

    // the message is built from the format string and its arguments via String.format
    ParseError(int pos, String errorFormat, Object... args) {
        final String formatted = String.format(errorFormat, args);
        this.pos = pos;
        this.errorMsg = formatted;
    }

    /**
     * Retrieve the error message.
     * @return the error message.
     */
    public String getErrorMessage() {
        return this.errorMsg;
    }

    /**
     * Retrieves the offset of the error.
     * @return error offset within input
     */
    public int getPosition() {
        return this.pos;
    }

    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        text.append(pos).append(": ").append(errorMsg);
        return text.toString();
    }
}

View File

@ -0,0 +1,34 @@
package ru.noties.markwon.html.jsoup.parser;
import java.util.ArrayList;
/**
 * A list of ParseErrors that also carries the maximum number of errors that should be collected.
 * The limit is advisory: it is reported by {@link #canAddError()} and is not enforced by the list itself.
 *
 * @author Jonathan Hedley
 */
public class ParseErrorList extends ArrayList<ParseError> {
    private static final int INITIAL_CAPACITY = 16;
    private final int maxSize;

    ParseErrorList(int initialCapacity, int maxSize) {
        super(initialCapacity);
        this.maxSize = maxSize;
    }

    /** @return true while the list holds fewer errors than the maximum */
    boolean canAddError() {
        final int current = size();
        return current < maxSize;
    }

    int getMaxSize() {
        return this.maxSize;
    }

    /** Creates a list with a maximum of zero, i.e. one for which {@link #canAddError()} is never true. */
    public static ParseErrorList noTracking() {
        final int none = 0;
        return new ParseErrorList(none, none);
    }

    /** Creates a list that accepts up to {@code maxSize} errors. */
    public static ParseErrorList tracking(int maxSize) {
        final ParseErrorList errors = new ParseErrorList(INITIAL_CAPACITY, maxSize);
        return errors;
    }
}

View File

@ -0,0 +1,398 @@
package ru.noties.markwon.html.jsoup.parser;
import android.support.annotation.NonNull;
import ru.noties.markwon.html.jsoup.helper.Validate;
import ru.noties.markwon.html.jsoup.nodes.Attributes;
import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase;
/**
* Parse tokens for the Tokeniser.
*/
public abstract class Token {
public final TokenType type;
/**
 * @param tokenType the kind of token this instance represents; stored in {@link #type}
 */
protected Token(@NonNull TokenType tokenType) {
this.type = tokenType;
}
// String tokenType() {
// return this.getClass().getSimpleName();
// }
/**
 * Resets the data represented by this token, so that the instance can be reused. This avoids creating
 * a new transfer object for every piece of data, each of which would immediately be garbage collected.
 *
 * @return this token
 */
public abstract Token reset();
// Empties the builder so it can be reused; a null builder is ignored.
static void reset(StringBuilder sb) {
    if (sb == null) {
        return;
    }
    sb.setLength(0);
}
/**
 * Token for a doctype: its name, the optional pubSysKey, the public and system identifiers and the
 * force-quirks flag.
 */
public static final class Doctype extends Token {
    final StringBuilder name = new StringBuilder();
    String pubSysKey = null;
    final StringBuilder publicIdentifier = new StringBuilder();
    final StringBuilder systemIdentifier = new StringBuilder();
    boolean forceQuirks = false;

    Doctype() {
        super(TokenType.Doctype);
    }

    // clears every builder and puts the plain fields back to their initial values
    @Override
    public Token reset() {
        forceQuirks = false;
        pubSysKey = null;
        reset(systemIdentifier);
        reset(publicIdentifier);
        reset(name);
        return this;
    }

    String getName() {
        final String current = name.toString();
        return current;
    }

    String getPubSysKey() {
        return this.pubSysKey;
    }

    String getPublicIdentifier() {
        final String current = publicIdentifier.toString();
        return current;
    }

    public String getSystemIdentifier() {
        final String current = systemIdentifier.toString();
        return current;
    }

    public boolean isForceQuirks() {
        return this.forceQuirks;
    }
}
/**
 * Shared implementation for start and end tag tokens: holds the tag name (original and lower-cased), the
 * self-closing flag, the attributes collected so far and the attribute that is currently being accumulated.
 */
public static abstract class Tag extends Token {

    public String tagName;
    public String normalName; // lc version of tag name, for case insensitive tree build
    public boolean selfClosing;
    public Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used).

    private String pendingAttributeName; // attribute names are generally caught in one hop, not accumulated
    private final StringBuilder pendingAttributeValue = new StringBuilder(); // but values are accumulated, from e.g. & in hrefs
    private String pendingAttributeValueS; // try to get attr vals in one shot, vs Builder
    private boolean hasEmptyAttributeValue; // distinguish boolean attribute from empty string value
    private boolean hasPendingAttributeValue;

    protected Tag(@NonNull TokenType tokenType) {
        super(tokenType);
    }

    /**
     * Clears the names, the self-closing flag, the attributes and any half-built attribute.
     *
     * @return this instance
     */
    @Override
    public Tag reset() {
        tagName = null;
        normalName = null;
        selfClosing = false;
        attributes = null;
        clearPendingAttribute();
        return this;
    }

    /**
     * Stores the pending attribute (if it has a non-empty trimmed name) into {@link #attributes}, creating the
     * attributes container when needed, and then clears the pending state for the next attribute.
     */
    final void newAttribute() {
        if (attributes == null) {
            attributes = new Attributes();
        }

        if (pendingAttributeName != null) {
            // the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here
            pendingAttributeName = pendingAttributeName.trim();
            if (pendingAttributeName.length() != 0) {
                attributes.put(pendingAttributeName, pendingValue());
            }
        }

        clearPendingAttribute();
    }

    /** Finalises the tag for emit by flushing an attribute that is still pending. */
    final void finaliseTag() {
        if (pendingAttributeName == null) {
            return;
        }
        // todo: check if attribute name exists; if so, drop and error
        newAttribute();
    }

    /** @return the tag name with its case preserved; validated to be neither null nor empty */
    final String name() { // preserves case, for input into Tag.valueOf (which may drop case)
        Validate.isFalse(tagName == null || tagName.length() == 0);
        return tagName;
    }

    /** @return the lower-cased name, used in tree building for working out where in tree it should go */
    final String normalName() {
        return normalName;
    }

    /**
     * Sets the tag name and its lower-cased counterpart.
     *
     * @return this instance
     */
    final Tag name(String name) {
        tagName = name;
        normalName = lowerCase(name);
        return this;
    }

    final boolean isSelfClosing() {
        return selfClosing;
    }

    @SuppressWarnings({"TypeMayBeWeakened"})
    final Attributes getAttributes() {
        return attributes;
    }

    // these appenders are rarely hit in not null state-- caused by null chars.
    final void appendTagName(String append) {
        if (tagName == null) {
            tagName = append;
        } else {
            tagName = tagName.concat(append);
        }
        normalName = lowerCase(tagName);
    }

    final void appendTagName(char append) {
        appendTagName(String.valueOf(append));
    }

    final void appendAttributeName(String append) {
        if (pendingAttributeName == null) {
            pendingAttributeName = append;
        } else {
            pendingAttributeName = pendingAttributeName.concat(append);
        }
    }

    final void appendAttributeName(char append) {
        appendAttributeName(String.valueOf(append));
    }

    final void appendAttributeValue(String append) {
        ensureAttributeValue();
        if (pendingAttributeValue.length() != 0) {
            pendingAttributeValue.append(append);
        } else {
            // nothing accumulated yet: keep the String as-is and avoid the builder
            pendingAttributeValueS = append;
        }
    }

    final void appendAttributeValue(char append) {
        ensureAttributeValue();
        pendingAttributeValue.append(append);
    }

    final void appendAttributeValue(char[] append) {
        ensureAttributeValue();
        pendingAttributeValue.append(append);
    }

    final void appendAttributeValue(int[] appendCodepoints) {
        ensureAttributeValue();
        for (int i = 0; i < appendCodepoints.length; i++) {
            pendingAttributeValue.appendCodePoint(appendCodepoints[i]);
        }
    }

    final void setEmptyAttributeValue() {
        hasEmptyAttributeValue = true;
    }

    /** Marks a value as present and moves a previously stored one-shot String into the builder. */
    private void ensureAttributeValue() {
        hasPendingAttributeValue = true;
        if (pendingAttributeValueS == null) {
            return;
        }
        // if on second hit, we'll need to move to the builder
        pendingAttributeValue.append(pendingAttributeValueS);
        pendingAttributeValueS = null;
    }

    /** Value to store for the pending attribute: accumulated text, empty string, or null when no value was seen. */
    private String pendingValue() {
        if (hasPendingAttributeValue) {
            return pendingAttributeValue.length() > 0 ? pendingAttributeValue.toString() : pendingAttributeValueS;
        }
        return hasEmptyAttributeValue ? "" : null;
    }

    /** Drops the pending attribute name, value and both value flags. */
    private void clearPendingAttribute() {
        pendingAttributeName = null;
        hasEmptyAttributeValue = false;
        hasPendingAttributeValue = false;
        Token.reset(pendingAttributeValue);
        pendingAttributeValueS = null;
    }
}
/**
 * Start tag token. Unlike the base {@link Tag}, it always carries an attributes container after construction
 * and after {@link #reset()}.
 */
public final static class StartTag extends Tag {

    StartTag() {
        super(TokenType.StartTag);
        this.attributes = new Attributes();
    }

    /**
     * Performs the base reset and then installs a fresh, empty attributes container.
     *
     * @return this instance
     */
    @Override
    public Tag reset() {
        super.reset();
        // todo - would prefer these to be null, but need to check Element assertions
        this.attributes = new Attributes();
        return this;
    }

    /**
     * Sets name and attributes in one call; the lower-cased name is derived from the given name.
     *
     * @return this instance
     */
    StartTag nameAttr(String name, Attributes attributes) {
        this.tagName = name;
        this.attributes = attributes;
        this.normalName = lowerCase(this.tagName);
        return this;
    }

    @Override
    public String toString() {
        final boolean hasAttributes = attributes != null && attributes.size() > 0;
        if (!hasAttributes) {
            return "<" + name() + ">";
        }
        return "<" + name() + " " + attributes.toString() + ">";
    }
}
/**
 * End tag token; adds only a textual representation on top of {@link Tag}.
 */
public final static class EndTag extends Tag {

    EndTag() {
        super(TokenType.EndTag);
    }

    /** @return the tag rendered as {@code </name>}; the name is validated by {@link #name()} */
    @Override
    public String toString() {
        final String tag = name();
        return "</" + tag + ">";
    }
}
/**
 * Comment token: accumulates the comment text and a {@code bogus} flag.
 */
public final static class Comment extends Token {

    final StringBuilder data = new StringBuilder();
    boolean bogus;

    Comment() {
        super(TokenType.Comment);
    }

    /**
     * Empties the accumulated text and clears the bogus flag.
     *
     * @return this instance
     */
    @Override
    public Token reset() {
        bogus = false;
        Token.reset(data);
        return this;
    }

    /** @return the accumulated comment text */
    String getData() {
        return data.toString();
    }

    @Override
    public String toString() {
        final String text = getData();
        return "<!--" + text + "-->";
    }
}
/**
 * Character-data token holding a single String payload.
 */
public static class Character extends Token {

    private String data;

    Character() {
        super(TokenType.Character);
    }

    /**
     * Drops the payload.
     *
     * @return this instance
     */
    @Override
    public Token reset() {
        this.data = null;
        return this;
    }

    /**
     * Replaces the payload.
     *
     * @return this instance
     */
    Character data(String data) {
        this.data = data;
        return this;
    }

    /** @return the current payload; {@code null} after {@link #reset()} until new data is set */
    public String getData() {
        return this.data;
    }

    /** @return the same value as {@link #getData()} */
    @Override
    public String toString() {
        return getData();
    }
}
/**
 * Character token that renders its payload wrapped in a CDATA section.
 */
public final static class CData extends Character {

    /**
     * @param data payload stored through {@link #data(String)}
     */
    CData(String data) {
        data(data);
    }

    @Override
    public String toString() {
        final String payload = getData();
        return "<![CDATA[" + payload + "]]>";
    }
}
/**
 * End-of-input token. It carries no data, so resetting it is a no-op.
 */
public final static class EOF extends Token {

    EOF() {
        super(TokenType.EOF);
    }

    /** @return this instance, unchanged */
    @Override
    public Token reset() {
        return this;
    }
}
// final boolean isDoctype() {
// return type == TokenType.Doctype;
// }
//
// final Doctype asDoctype() {
// return (Doctype) this;
// }
//
// final boolean isStartTag() {
// return type == TokenType.StartTag;
// }
//
// final StartTag asStartTag() {
// return (StartTag) this;
// }
//
// final boolean isEndTag() {
// return type == TokenType.EndTag;
// }
//
// final EndTag asEndTag() {
// return (EndTag) this;
// }
//
// final boolean isComment() {
// return type == TokenType.Comment;
// }
//
// final Comment asComment() {
// return (Comment) this;
// }
//
// final boolean isCharacter() {
// return type == TokenType.Character;
// }
//
// final boolean isCData() {
// return this instanceof CData;
// }
//
// final Character asCharacter() {
// return (Character) this;
// }
//
// final boolean isEOF() {
// return type == TokenType.EOF;
// }
/**
 * Discriminator stored in {@link Token#type}. Each token class in this file passes its own constant to the
 * {@link Token} constructor; {@code CData} has no constant of its own and reports {@link #Character}.
 */
public enum TokenType {
Doctype,
StartTag,
EndTag,
Comment,
Character, // note no CData - treated in builder as an extension of Character
EOF
}
}

View File

@ -0,0 +1,295 @@
package ru.noties.markwon.html.jsoup.parser;
import java.util.Arrays;
import ru.noties.markwon.html.jsoup.helper.Validate;
import ru.noties.markwon.html.jsoup.nodes.Entities;
/**
* Reads the input stream into tokens.
*/
public final class Tokeniser {
// Also written as the result of a numeric character reference whose value is outside the valid range
// (see consumeCharacterReference).
static final char replacementChar = '\uFFFD'; // replaces null character
// If the reader is positioned at one of these characters, consumeCharacterReference returns null without
// consuming anything. Sorted once in the static initializer below because it is passed to
// reader.matchesAnySorted.
private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};
// Some illegal character escapes are parsed by browsers as windows-1252 instead. See issue #1034
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
// Numeric references in [win1252ExtensionsStart, win1252ExtensionsStart + win1252Extensions.length) are
// replaced by the table entry at (value - win1252ExtensionsStart).
static final int win1252ExtensionsStart = 0x80;
static final int[] win1252Extensions = new int[] {
// we could build this manually, but Windows-1252 is not a standard java charset so that could break on
// some platforms - this table is verified with a test
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
};
static {
Arrays.sort(notCharRefCharsSorted);
}
private final CharacterReader reader; // html input
private final ParseErrorList errors; // errors found while tokenising
private TokeniserState state = TokeniserState.Data; // current tokenisation state
private Token emitPending; // the token we are about to emit on next read
private boolean isEmitPending = false; // set by emit(Token), cleared by read() when that token is returned
private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one
private StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>
Token.Tag tagPending; // tag we are building up
// Reusable token instances: the create*Pending methods reset these instead of allocating new tokens.
Token.StartTag startPending = new Token.StartTag();
Token.EndTag endPending = new Token.EndTag();
Token.Character charPending = new Token.Character();
Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
Token.Comment commentPending = new Token.Comment(); // comment building up
private String lastStartTag; // the last start tag emitted, to test appropriate end tag
/**
 * Creates a tokeniser over the given input.
 *
 * @param reader html input to tokenise
 * @param errors collector for errors found while tokenising
 */
public Tokeniser(CharacterReader reader, ParseErrorList errors) {
    this.errors = errors;
    this.reader = reader;
}
/**
 * Runs the state machine until a token has been emitted and returns the next token to hand out.
 * Buffered characters are returned first (through the shared {@code charPending} instance); in that case the
 * emitted token stays pending and is returned by a following call.
 *
 * @return the buffered character token if any characters are pending, otherwise the pending emitted token
 */
public Token read() {
    while (!isEmitPending) {
        state.read(this, reader);
    }

    // several chunks were emitted before this read: hand them out as one character token
    if (charsBuilder.length() > 0) {
        final String buffered = charsBuilder.toString();
        charsBuilder.setLength(0);
        charsString = null;
        return charPending.data(buffered);
    }

    // exactly one chunk was emitted: no builder involved
    if (charsString != null) {
        final String single = charsString;
        charsString = null;
        return charPending.data(single);
    }

    // no characters buffered: release the non-character token itself
    isEmitPending = false;
    return emitPending;
}
/**
 * Queues a token for the next {@link #read()}. Only one token may be pending at a time. For start tags the
 * tag name is remembered as the last start tag; for end tags that carry attributes an error is recorded.
 */
void emit(Token token) {
    Validate.isFalse(isEmitPending, "There is an unread token pending!");
    emitPending = token;
    isEmitPending = true;

    final Token.TokenType kind = token.type;
    if (kind == Token.TokenType.StartTag) {
        lastStartTag = ((Token.StartTag) token).tagName;
        return;
    }
    if (kind == Token.TokenType.EndTag) {
        final Token.EndTag closing = (Token.EndTag) token;
        if (closing.attributes != null) {
            error("Attributes incorrectly present on end tag");
        }
    }
}
/**
 * Buffers character data so that a run of emits (character refs etc.) is returned as a single token.
 * Does not set {@code isEmitPending}; {@link #read()} checks the buffers itself.
 */
void emit(final String str) {
    if (charsString == null) {
        // first chunk since the last read: keep the String and avoid the builder
        charsString = str;
        return;
    }
    if (charsBuilder.length() == 0) {
        // switching to string builder as more than one emit before read
        charsBuilder.append(charsString);
    }
    charsBuilder.append(str);
}
/** Buffers the given characters; delegates to {@link #emit(String)}. */
void emit(char[] chars) {
    final String text = String.valueOf(chars);
    emit(text);
}
/** Buffers the given code points as text; delegates to {@link #emit(String)}. */
void emit(int[] codepoints) {
    final String text = new String(codepoints, 0, codepoints.length);
    emit(text);
}
/** Buffers a single character; delegates to {@link #emit(String)}. */
void emit(char c) {
    final String text = String.valueOf(c);
    emit(text);
}
/** @return the current tokenisation state */
TokeniserState getState() {
    return this.state;
}
/**
 * Switches to the given state without touching the reader.
 *
 * @param state state to use from now on
 */
void transition(TokeniserState state) {
    this.state = state;
}
/**
 * Calls {@code reader.advance()} and then switches to the given state.
 *
 * @param state state to use from now on
 */
void advanceTransition(TokeniserState state) {
    reader.advance();
    transition(state);
}
// Result buffers reused across calls so that consumeCharacterReference does not allocate an array each time.
// A returned array is overwritten by the next call, so callers must use its contents before calling again.
final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
final private int[] multipointHolder = new int[2];

/**
 * Tries to consume a character reference (numeric such as {@code #65;} / {@code #x41;}, or named) at the
 * current reader position.
 *
 * @param additionalAllowedCharacter if non-null and equal to the current input character, nothing is consumed
 *                                   and {@code null} is returned
 * @param inAttribute                when true, a named reference that is directly followed by a letter, a
 *                                   digit, {@code '='}, {@code '-'} or {@code '_'} is not treated as a reference
 * @return {@code null} when no reference was consumed (the reader is left at, or rewound to, the position it
 * had on entry); otherwise a shared array holding one or two code points. The array is reused by later calls.
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty())
        return null;
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
        return null;
    if (reader.matchesAnySorted(notCharRefCharsSorted))
        return null;

    // remember the start so that a failed attempt can rewind
    reader.mark();
    if (reader.matchConsume("#")) {
        return consumeNumericReference();
    }
    return consumeNamedReference(inAttribute);
}

// Handles the part after "#" of a numeric reference; expects reader.mark() to have been called at the start.
private int[] consumeNumericReference() {
    final boolean isHexMode = reader.matchConsumeIgnoreCase("X");
    final String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
    if (numRef.length() == 0) { // didn't match anything
        characterReferenceError("numeric reference with no numerals");
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";"))
        characterReferenceError("missing semicolon"); // missing semi

    int charval = -1;
    try {
        charval = Integer.parseInt(numRef, isHexMode ? 16 : 10);
    } catch (NumberFormatException ignored) {
        // deliberately ignored: charval stays -1 and is reported as out of range below
    }

    final int[] codeRef = codepointHolder;
    if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
        characterReferenceError("character outside of valid range");
        codeRef[0] = replacementChar;
        return codeRef;
    }

    // fix illegal unicode characters to match browser behavior
    if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) {
        characterReferenceError("character is not a valid unicode code point");
        charval = win1252Extensions[charval - win1252ExtensionsStart];
    }
    // todo: implement number replacement table
    // todo: check for extra illegal unicode points as parse errors
    codeRef[0] = charval;
    return codeRef;
}

// Handles a named reference; expects reader.mark() to have been called at the start.
private int[] consumeNamedReference(boolean inAttribute) {
    // get as many letters as possible, and look for matching entities.
    final String nameRef = reader.consumeLetterThenDigitSequence();
    final boolean looksLegit = reader.matches(';');
    // found if a base named entity without a ;, or an extended entity with the ;.
    final boolean found = Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit);
    if (!found) {
        reader.rewindToMark();
        if (looksLegit) // named with semicolon
            characterReferenceError("invalid named reference '" + nameRef + "'");
        return null;
    }
    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        // don't want that to match
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";"))
        characterReferenceError("missing semicolon"); // missing semi

    final int numChars = Entities.codepointsForName(nameRef, multipointHolder);
    if (numChars == 1) {
        final int[] codeRef = codepointHolder;
        codeRef[0] = multipointHolder[0];
        return codeRef;
    }
    if (numChars != 2) {
        Validate.fail("Unexpected characters returned for " + nameRef);
    }
    return multipointHolder;
}
/**
 * Resets the reusable start or end tag instance and makes it the tag being built.
 *
 * @param start true to use the start tag instance, false for the end tag instance
 * @return the tag that is now pending
 */
Token.Tag createTagPending(boolean start) {
    if (start) {
        tagPending = startPending.reset();
    } else {
        tagPending = endPending.reset();
    }
    return tagPending;
}
/** Finalises the tag being built (flushing a pending attribute) and emits it. */
void emitTagPending() {
    final Token.Tag tag = tagPending;
    tag.finaliseTag();
    emit(tag);
}
/** Starts a new comment by resetting the reusable comment token. */
void createCommentPending() {
    this.commentPending.reset();
}
/** Emits the comment token that is currently being built. */
void emitCommentPending() {
    emit(this.commentPending);
}
/** Starts a new doctype by resetting the reusable doctype token. */
void createDoctypePending() {
    this.doctypePending.reset();
}
/** Emits the doctype token that is currently being built. */
void emitDoctypePending() {
    emit(this.doctypePending);
}
/** Empties {@code dataBuffer} so it can be used as a fresh temporary buffer. */
void createTempBuffer() {
    Token.reset(this.dataBuffer);
}
/**
 * @return true if a start tag has been emitted before and the name of the tag being built equals the name of
 * the last emitted start tag, ignoring case
 */
boolean isAppropriateEndTagToken() {
    if (lastStartTag == null) {
        return false;
    }
    return tagPending.name().equalsIgnoreCase(lastStartTag);
}
/** @return the name of the last emitted start tag, or {@code null} if no start tag has been emitted yet */
String appropriateEndTagName() {
    return this.lastStartTag; // could be null
}
/**
 * Records an "unexpected character" error at the current reader position, unless the error list refuses
 * further errors.
 *
 * @param state state reported in the message
 */
void error(TokeniserState state) {
    if (!errors.canAddError()) {
        return;
    }
    errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
}
/**
 * Records an "unexpected end of file" error at the current reader position, unless the error list refuses
 * further errors.
 *
 * @param state state reported in the message
 */
void eofError(TokeniserState state) {
    if (!errors.canAddError()) {
        return;
    }
    errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
}
/**
 * Records an "invalid character reference" error at the current reader position, unless the error list
 * refuses further errors.
 *
 * @param message detail inserted into the error text
 */
private void characterReferenceError(String message) {
    if (!errors.canAddError()) {
        return;
    }
    errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
}
/**
 * Records an error with the given message at the current reader position, unless the error list refuses
 * further errors.
 *
 * @param errorMsg text of the error
 */
void error(String errorMsg) {
    if (!errors.canAddError()) {
        return;
    }
    errors.add(new ParseError(reader.pos(), errorMsg));
}
/**
 * Namespace check placeholder: namespaces are not implemented, so this always answers {@code true}.
 *
 * @return always {@code true}
 */
boolean currentNodeInHtmlNS() {
    // todo: implement namespaces correctly
    // Sketch of the intended check, kept for reference:
    // Element currentNode = currentNode();
    // return currentNode != null && currentNode.namespace().equals("HTML");
    return true;
}
// /**
// * Utility method to consume reader and unescape entities found within.
// * @param inAttribute if the text to be unescaped is in an attribute
// * @return unescaped string from reader
// */
// String unescapeEntities(boolean inAttribute) {
// StringBuilder builder = StringUtil.stringBuilder();
// while (!reader.isEmpty()) {
// builder.append(reader.consumeTo('&'));
// if (reader.matches('&')) {
// reader.consume();
// int[] c = consumeCharacterReference(null, inAttribute);
// if (c == null || c.length==0)
// builder.append('&');
// else {
// builder.appendCodePoint(c[0]);
// if (c.length == 2)
// builder.appendCodePoint(c[1]);
// }
//
// }
// }
// return builder.toString();
// }
}

View File

@ -1 +1,2 @@
include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension', ':library-syntax' include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension',
':library-syntax', ':html-parser-api', ':html-parser-impl'