Added 2 modules: html-parser-api and html-parser-impl
This commit is contained in:
parent
7c7b1f59a8
commit
ff3bedc37e
31
html-parser-api/build.gradle
Normal file
31
html-parser-api/build.gradle
Normal file
@ -0,0 +1,31 @@
|
||||
// Build script of the `html-parser-api` module, built as an Android library
apply plugin: 'com.android.library'
|
||||
|
||||
android {
|
||||
|
||||
// SDK / build-tools versions are not declared here, they are resolved from properties
// defined outside of this script (TARGET_SDK, BUILD_TOOLS, MIN_SDK)
compileSdkVersion TARGET_SDK
|
||||
buildToolsVersion BUILD_TOOLS
|
||||
|
||||
defaultConfig {
|
||||
minSdkVersion MIN_SDK
|
||||
targetSdkVersion TARGET_SDK
|
||||
versionCode 1
|
||||
versionName version
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
// exposed with `api` (not `implementation`) so the dependency is visible to consumers of this module
api SUPPORT_ANNOTATIONS
|
||||
}
|
||||
|
||||
// once the project is evaluated, switch off the task that generates BuildConfig for the release variant
afterEvaluate {
|
||||
generateReleaseBuildConfig.enabled = false
|
||||
}
|
||||
|
||||
// Publishing is configured only when the build is started with the `release` property.
// When `local` is also present both repository URLs are redirected to LOCAL_MAVEN_URL
// before the push script is applied.
// NOTE(review): the push script is applied from the `master` branch of a remote repository,
// so its content is not pinned to a revision - consider pinning/vendoring it.
// todo: remove `local` check after merge with latest version (1.1.1)
|
||||
if (hasProperty('release')) {
|
||||
if (hasProperty('local')) {
|
||||
ext.RELEASE_REPOSITORY_URL = LOCAL_MAVEN_URL
|
||||
ext.SNAPSHOT_REPOSITORY_URL = LOCAL_MAVEN_URL
|
||||
}
|
||||
apply from: 'https://raw.githubusercontent.com/noties/gradle-mvn-push/master/gradle-mvn-push-aar.gradle'
|
||||
}
|
1
html-parser-api/src/main/AndroidManifest.xml
Normal file
1
html-parser-api/src/main/AndroidManifest.xml
Normal file
@ -0,0 +1 @@
|
||||
<manifest package="ru.noties.markwon.html" />
|
@ -0,0 +1,54 @@
|
||||
package ru.noties.markwon.html;
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
import android.support.annotation.Nullable;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @see Inline
|
||||
* @see Block
|
||||
*/
|
||||
public interface HtmlTag {
|
||||
|
||||
/**
|
||||
* @return normalized tag name (lower-case)
|
||||
*/
|
||||
@NonNull
|
||||
String name();
|
||||
|
||||
/**
|
||||
* @return index at which this tag starts
|
||||
*/
|
||||
int start();
|
||||
|
||||
/**
|
||||
* @return index at which this tag ends
|
||||
*/
|
||||
int end();
|
||||
|
||||
/**
|
||||
* Represents <em>really</em> inline HTML tags (unline commonmark definitions)
|
||||
*/
|
||||
interface Inline extends HtmlTag {
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents HTML block tags. Please note that all tags that are not inline should be
|
||||
* considered as block tags
|
||||
*/
|
||||
interface Block extends HtmlTag {
|
||||
|
||||
/**
|
||||
* @return parent {@link Block} or null if there is no parent (this block is at root level)
|
||||
*/
|
||||
@Nullable
|
||||
Block parent();
|
||||
|
||||
/**
|
||||
* @return list of children
|
||||
*/
|
||||
@NonNull
|
||||
List<Block> children();
|
||||
}
|
||||
}
|
@ -0,0 +1,36 @@
|
||||
package ru.noties.markwon.html;
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public abstract class MarkwonHtmlParser {
|
||||
|
||||
@NonNull
|
||||
public static MarkwonHtmlParser noOp() {
|
||||
return new MarkwonHtmlParserNoOp();
|
||||
}
|
||||
|
||||
public interface FlushAction<T> {
|
||||
void apply(@NonNull List<T> tags);
|
||||
}
|
||||
|
||||
public abstract <T extends Appendable & CharSequence> void processFragment(
|
||||
@NonNull T output,
|
||||
@NonNull String htmlFragment);
|
||||
|
||||
// clear all pending tags (if any)
|
||||
// todo: we also can do this: if supplied value is -1 (for example) we ignore tags that are not closed
|
||||
public abstract void flushInlineTags(
|
||||
int documentLength,
|
||||
@NonNull FlushAction<HtmlTag.Inline> action);
|
||||
|
||||
// clear all pending blocks if any
|
||||
// todo: we also can do this: if supplied value is -1 (for example) we ignore tags that are not closed
|
||||
public abstract void flushBlockTags(
|
||||
int documentLength,
|
||||
@NonNull FlushAction<HtmlTag.Block> action);
|
||||
|
||||
public abstract void reset();
|
||||
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
package ru.noties.markwon.html;
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
|
||||
class MarkwonHtmlParserNoOp extends MarkwonHtmlParser {
|
||||
|
||||
@Override
|
||||
public <T extends Appendable & CharSequence> void processFragment(@NonNull T output, @NonNull String htmlFragment) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flushInlineTags(int documentLength, @NonNull FlushAction<HtmlTag.Inline> action) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flushBlockTags(int documentLength, @NonNull FlushAction<HtmlTag.Block> action) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
|
||||
}
|
||||
}
|
32
html-parser-impl/build.gradle
Normal file
32
html-parser-impl/build.gradle
Normal file
@ -0,0 +1,32 @@
|
||||
// Build script of the `html-parser-impl` module, built as an Android library
apply plugin: 'com.android.library'
|
||||
|
||||
android {
|
||||
|
||||
// SDK / build-tools versions are not declared here, they are resolved from properties
// defined outside of this script (TARGET_SDK, BUILD_TOOLS, MIN_SDK)
compileSdkVersion TARGET_SDK
|
||||
buildToolsVersion BUILD_TOOLS
|
||||
|
||||
defaultConfig {
|
||||
minSdkVersion MIN_SDK
|
||||
targetSdkVersion TARGET_SDK
|
||||
versionCode 1
|
||||
versionName version
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
// both dependencies are exposed with `api` so they are visible to consumers of this module
api SUPPORT_ANNOTATIONS
|
||||
// the API module that this implementation is built on
api project(':html-parser-api')
|
||||
}
|
||||
|
||||
// once the project is evaluated, switch off the task that generates BuildConfig for the release variant
afterEvaluate {
|
||||
generateReleaseBuildConfig.enabled = false
|
||||
}
|
||||
|
||||
// Publishing is configured only when the build is started with the `release` property.
// When `local` is also present both repository URLs are redirected to LOCAL_MAVEN_URL
// before the push script is applied.
// NOTE(review): the push script is applied from the `master` branch of a remote repository,
// so its content is not pinned to a revision - consider pinning/vendoring it.
// todo: remove `local` check after merge with latest version (1.1.1)
|
||||
if (hasProperty('release')) {
|
||||
if (hasProperty('local')) {
|
||||
ext.RELEASE_REPOSITORY_URL = LOCAL_MAVEN_URL
|
||||
ext.SNAPSHOT_REPOSITORY_URL = LOCAL_MAVEN_URL
|
||||
}
|
||||
apply from: 'https://raw.githubusercontent.com/noties/gradle-mvn-push/master/gradle-mvn-push-aar.gradle'
|
||||
}
|
1
html-parser-impl/src/main/AndroidManifest.xml
Normal file
1
html-parser-impl/src/main/AndroidManifest.xml
Normal file
@ -0,0 +1 @@
|
||||
<manifest package="ru.noties.markwon.html" />
|
@ -0,0 +1,117 @@
|
||||
package ru.noties.markwon.html;
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
import android.support.annotation.Nullable;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
abstract class HtmlTagImpl implements HtmlTag {
|
||||
|
||||
static final int NO_VALUE = -1;
|
||||
|
||||
final String name;
|
||||
final int start;
|
||||
int end = NO_VALUE;
|
||||
|
||||
protected HtmlTagImpl(@NonNull String name, int start) {
|
||||
this.name = name;
|
||||
this.start = start;
|
||||
}
|
||||
|
||||
@NonNull
|
||||
@Override
|
||||
public String name() {
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int start() {
|
||||
return start;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int end() {
|
||||
return end;
|
||||
}
|
||||
|
||||
boolean isClosed() {
|
||||
return end > NO_VALUE;
|
||||
}
|
||||
|
||||
abstract void closeAt(int end);
|
||||
|
||||
|
||||
static class InlineImpl extends HtmlTagImpl implements Inline {
|
||||
|
||||
InlineImpl(@NonNull String name, int start) {
|
||||
super(name, start);
|
||||
}
|
||||
|
||||
@Override
|
||||
void closeAt(int end) {
|
||||
if (!isClosed()) {
|
||||
super.end = end;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static class BlockImpl extends HtmlTagImpl implements Block {
|
||||
|
||||
@NonNull
|
||||
static BlockImpl root() {
|
||||
//noinspection ConstantConditions
|
||||
return new BlockImpl("", 0, null);
|
||||
}
|
||||
|
||||
@NonNull
|
||||
static BlockImpl create(@NonNull String name, int start, @NonNull BlockImpl parent) {
|
||||
return new BlockImpl(name, start, parent);
|
||||
}
|
||||
|
||||
final BlockImpl parent;
|
||||
List<BlockImpl> children;
|
||||
|
||||
@SuppressWarnings("NullableProblems")
|
||||
BlockImpl(@NonNull String name, int start, @NonNull BlockImpl parent) {
|
||||
super(name, start);
|
||||
this.parent = parent;
|
||||
}
|
||||
|
||||
@Override
|
||||
void closeAt(int end) {
|
||||
if (!isClosed()) {
|
||||
super.end = end;
|
||||
if (children != null) {
|
||||
for (BlockImpl child: children) {
|
||||
child.closeAt(end);
|
||||
}
|
||||
children = Collections.unmodifiableList(children);
|
||||
} else {
|
||||
children = Collections.emptyList();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
boolean isRoot() {
|
||||
return parent == null;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
@Override
|
||||
public Block parent() {
|
||||
if (parent == null) {
|
||||
throw new IllegalStateException("#parent() getter was called on the root node " +
|
||||
"which should not be exposed outside internal usage");
|
||||
}
|
||||
return parent;
|
||||
}
|
||||
|
||||
@NonNull
|
||||
@Override
|
||||
public List<Block> children() {
|
||||
//noinspection unchecked
|
||||
return (List<Block>) (List<? extends Block>) children;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,396 @@
|
||||
package ru.noties.markwon.html;
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
import android.support.annotation.Nullable;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import ru.noties.markwon.html.HtmlTag.Block;
|
||||
import ru.noties.markwon.html.HtmlTag.Inline;
|
||||
import ru.noties.markwon.html.HtmlTagImpl.BlockImpl;
|
||||
import ru.noties.markwon.html.HtmlTagImpl.InlineImpl;
|
||||
import ru.noties.markwon.html.jsoup.parser.CharacterReader;
|
||||
import ru.noties.markwon.html.jsoup.parser.ParseErrorList;
|
||||
import ru.noties.markwon.html.jsoup.parser.Token;
|
||||
import ru.noties.markwon.html.jsoup.parser.Tokeniser;
|
||||
|
||||
public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
|
||||
|
||||
@NonNull
|
||||
public static MarkwonHtmlParserImpl create() {
|
||||
return new MarkwonHtmlParserImpl();
|
||||
}
|
||||
|
||||
// https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
|
||||
private static final Set<String> INLINE_TAGS;
|
||||
|
||||
private static final Set<String> VOID_TAGS;
|
||||
|
||||
// these are the tags that are considered _block_ ones
|
||||
// this parser will ensure that these blocks are started on a new line
|
||||
// other tags that are NOT inline are considered as block tags, but won't have new line
|
||||
// inserted before them
|
||||
// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
|
||||
private static final Set<String> BLOCK_TAGS;
|
||||
|
||||
private static final String TAG_PARAGRAPH = "p";
|
||||
private static final String TAG_LIST_ITEM = "li";
|
||||
|
||||
// todo: make it configurable
|
||||
private static final String IMG_REPLACEMENT = "\uFFFC";
|
||||
|
||||
static {
|
||||
INLINE_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
|
||||
"a", "abbr", "acronym",
|
||||
"b", "bdo", "big", "br", "button",
|
||||
"cite", "code",
|
||||
"dfn",
|
||||
"em",
|
||||
"i", "img", "input",
|
||||
"kbd",
|
||||
"label",
|
||||
"map",
|
||||
"object",
|
||||
"q",
|
||||
"samp", "script", "select", "small", "span", "strong", "sub", "sup",
|
||||
"textarea", "time", "tt",
|
||||
"var"
|
||||
)));
|
||||
VOID_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
|
||||
"area",
|
||||
"base", "br",
|
||||
"col",
|
||||
"embed",
|
||||
"hr",
|
||||
"img", "input",
|
||||
"keygen",
|
||||
"link",
|
||||
"meta",
|
||||
"param",
|
||||
"source",
|
||||
"track",
|
||||
"wbr"
|
||||
)));
|
||||
BLOCK_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
|
||||
"address", "article", "aside",
|
||||
"blockquote",
|
||||
"canvas",
|
||||
"dd", "div", "dl", "dt",
|
||||
"fieldset", "figcaption", "figure", "footer", "form",
|
||||
"h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
|
||||
"li",
|
||||
"main",
|
||||
"nav", "noscript",
|
||||
"ol", "output",
|
||||
"p", "pre",
|
||||
"section",
|
||||
"table", "tfoot",
|
||||
"ul",
|
||||
"video"
|
||||
)));
|
||||
}
|
||||
|
||||
private final List<InlineImpl> inlineTags = new ArrayList<>(0);
|
||||
|
||||
private BlockImpl currentBlock = BlockImpl.root();
|
||||
|
||||
|
||||
@Override
|
||||
public <T extends Appendable & CharSequence> void processFragment(
|
||||
@NonNull T output,
|
||||
@NonNull String htmlFragment) {
|
||||
|
||||
// todo: maybe there is a way to reuse tokeniser...
|
||||
final Tokeniser tokeniser = new Tokeniser(new CharacterReader(htmlFragment), ParseErrorList.noTracking());
|
||||
|
||||
while (true) {
|
||||
|
||||
final Token token = tokeniser.read();
|
||||
final Token.TokenType tokenType = token.type;
|
||||
|
||||
if (Token.TokenType.EOF == tokenType) {
|
||||
break;
|
||||
}
|
||||
|
||||
switch (tokenType) {
|
||||
|
||||
case StartTag: {
|
||||
|
||||
final Token.StartTag startTag = (Token.StartTag) token;
|
||||
|
||||
if (isInlineTag(startTag.normalName)) {
|
||||
processInlineTagStart(output, startTag);
|
||||
} else {
|
||||
processBlockTagStart(output, startTag);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case EndTag: {
|
||||
|
||||
final Token.EndTag endTag = (Token.EndTag) token;
|
||||
|
||||
if (isInlineTag(endTag.normalName)) {
|
||||
processInlineTagEnd(output, endTag);
|
||||
} else {
|
||||
processBlockTagEnd(output, endTag);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case Character: {
|
||||
processCharacter(output, ((Token.Character) token));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// do not forget to reset processed token (even if it's not processed)
|
||||
token.reset();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flushInlineTags(int documentLength, @NonNull FlushAction<Inline> action) {
|
||||
if (inlineTags.size() > 0) {
|
||||
for (InlineImpl inline : inlineTags) {
|
||||
inline.closeAt(documentLength);
|
||||
}
|
||||
//noinspection unchecked
|
||||
action.apply(Collections.unmodifiableList((List<? extends Inline>) inlineTags));
|
||||
inlineTags.clear();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flushBlockTags(int documentLength, @NonNull FlushAction<Block> action) {
|
||||
|
||||
BlockImpl block = currentBlock;
|
||||
while (!block.isRoot()) {
|
||||
block = block.parent;
|
||||
}
|
||||
|
||||
block.closeAt(documentLength);
|
||||
|
||||
final List<Block> children = block.children();
|
||||
if (children.size() > 0) {
|
||||
action.apply(children);
|
||||
}
|
||||
|
||||
currentBlock = BlockImpl.root();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
inlineTags.clear();
|
||||
currentBlock = BlockImpl.root();
|
||||
}
|
||||
|
||||
|
||||
protected <T extends Appendable & CharSequence> void processInlineTagStart(
|
||||
@NonNull T output,
|
||||
@NonNull Token.StartTag startTag) {
|
||||
|
||||
final String name = startTag.normalName;
|
||||
|
||||
final InlineImpl inline = new InlineImpl(name, output.length());
|
||||
|
||||
if (isVoidTag(name)
|
||||
|| startTag.selfClosing) {
|
||||
|
||||
// check if we have content to append as we must close this tag here
|
||||
processVoidTag(output, startTag);
|
||||
|
||||
inline.end = output.length();
|
||||
}
|
||||
|
||||
// actually only check if there is content for void/self-closing tags
|
||||
// if none -> ignore it
|
||||
if (inline.start != inline.end) {
|
||||
inlineTags.add(inline);
|
||||
}
|
||||
}
|
||||
|
||||
protected <T extends Appendable & CharSequence> void processInlineTagEnd(
|
||||
@NonNull T output,
|
||||
@NonNull Token.EndTag endTag) {
|
||||
|
||||
// try to find it, if none found -> ignore
|
||||
final InlineImpl openInlineTag = findOpenInlineTag(endTag.normalName);
|
||||
if (openInlineTag != null) {
|
||||
// close open inline tag
|
||||
openInlineTag.end = output.length();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
protected <T extends Appendable & CharSequence> void processBlockTagStart(
|
||||
@NonNull T output,
|
||||
@NonNull Token.StartTag startTag) {
|
||||
|
||||
final String name = startTag.normalName;
|
||||
|
||||
// block tags (all that are NOT inline -> blocks
|
||||
// I think there is only one strong rule -> paragraph cannot contain anything
|
||||
// except inline tags
|
||||
// also, closing paragraph with non-closed inlines -> doesn't close inlines
|
||||
// they are continued for _afterwards_
|
||||
|
||||
if (TAG_PARAGRAPH.equals(currentBlock.name)) {
|
||||
// it must be closed here not matter what we are as here we _assume_
|
||||
// that it's a block tag
|
||||
append(output, "\n");
|
||||
currentBlock.end = output.length();
|
||||
currentBlock = currentBlock.parent;
|
||||
} else if (TAG_LIST_ITEM.equals(name)
|
||||
&& TAG_LIST_ITEM.equals(currentBlock.name)) {
|
||||
// close previous list item if in the same parent
|
||||
currentBlock.end = output.length();
|
||||
currentBlock = currentBlock.parent;
|
||||
}
|
||||
|
||||
if (isBlockTag(name)) {
|
||||
ensureNewLine(output);
|
||||
}
|
||||
|
||||
final int start = output.length();
|
||||
|
||||
final BlockImpl block = BlockImpl.create(name, start, currentBlock);
|
||||
|
||||
//noinspection ConstantConditions
|
||||
appendBlockChild(block.parent, block);
|
||||
|
||||
this.currentBlock = block;
|
||||
}
|
||||
|
||||
protected <T extends Appendable & CharSequence> void processBlockTagEnd(
|
||||
@NonNull T output,
|
||||
@NonNull Token.EndTag endTag) {
|
||||
|
||||
final String name = endTag.normalName;
|
||||
|
||||
final BlockImpl block = findOpenBlockTag(endTag.normalName);
|
||||
if (block != null) {
|
||||
|
||||
if (TAG_PARAGRAPH.equals(name)) {
|
||||
append(output, "\n");
|
||||
}
|
||||
|
||||
block.closeAt(output.length());
|
||||
this.currentBlock = block.parent;
|
||||
}
|
||||
}
|
||||
|
||||
protected <T extends Appendable & CharSequence> void processVoidTag(
|
||||
@NonNull T output,
|
||||
@NonNull Token.StartTag startTag) {
|
||||
|
||||
final String name = startTag.normalName;
|
||||
|
||||
if ("br".equals(name)) {
|
||||
append(output, "\n");
|
||||
} else if ("img".equals(name)) {
|
||||
final String alt = startTag.attributes.getIgnoreCase("alt");
|
||||
if (alt == null
|
||||
|| alt.length() == 0) {
|
||||
// no alt is provided
|
||||
append(output, IMG_REPLACEMENT);
|
||||
} else {
|
||||
append(output, alt);
|
||||
}
|
||||
}
|
||||
|
||||
// other tags are ignored
|
||||
}
|
||||
|
||||
protected <T extends Appendable & CharSequence> void processCharacter(
|
||||
@NonNull T output,
|
||||
@NonNull Token.Character character) {
|
||||
|
||||
// the thing here is: if it's a script tag that we are inside -> we must not treat this
|
||||
// as the text to append... should we even care about this? how many people are
|
||||
// going to include freaking script tags as html inline?
|
||||
//
|
||||
// so tags are: BUTTON, INPUT, SELECT, SCRIPT, TEXTAREA
|
||||
//
|
||||
// actually we must decide it here: should we append freaking characters for these _bad_
|
||||
// tags or not, as later we won't be able to change it and/or allow modification (as
|
||||
// all indexes will be affected with this)
|
||||
|
||||
// for now: ignore the inline context
|
||||
append(output, character.getData());
|
||||
}
|
||||
|
||||
protected void appendBlockChild(@NonNull BlockImpl parent, @NonNull BlockImpl child) {
|
||||
List<BlockImpl> children = parent.children;
|
||||
if (children == null) {
|
||||
children = new ArrayList<>(2);
|
||||
parent.children = children;
|
||||
}
|
||||
children.add(child);
|
||||
}
|
||||
|
||||
@Nullable
|
||||
protected InlineImpl findOpenInlineTag(@NonNull String name) {
|
||||
|
||||
InlineImpl inline;
|
||||
|
||||
for (int i = inlineTags.size() - 1; i > -1; i--) {
|
||||
inline = inlineTags.get(i);
|
||||
if (name.equals(inline.name)
|
||||
&& inline.end < 0) {
|
||||
return inline;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
protected BlockImpl findOpenBlockTag(@NonNull String name) {
|
||||
|
||||
BlockImpl blockTag = currentBlock;
|
||||
|
||||
while (blockTag != null
|
||||
&& !name.equals(blockTag.name)) {
|
||||
blockTag = blockTag.parent;
|
||||
}
|
||||
|
||||
return blockTag;
|
||||
}
|
||||
|
||||
// name here must lower case
|
||||
protected static boolean isInlineTag(@NonNull String name) {
|
||||
return INLINE_TAGS.contains(name);
|
||||
}
|
||||
|
||||
protected static boolean isVoidTag(@NonNull String name) {
|
||||
return VOID_TAGS.contains(name);
|
||||
}
|
||||
|
||||
protected static boolean isBlockTag(@NonNull String name) {
|
||||
return BLOCK_TAGS.contains(name);
|
||||
}
|
||||
|
||||
protected static void append(@NonNull Appendable appendable, @NonNull CharSequence text) {
|
||||
try {
|
||||
appendable.append(text);
|
||||
} catch (IOException e) {
|
||||
// _must_ not happen
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
protected static <T extends Appendable & CharSequence> void ensureNewLine(@NonNull T output) {
|
||||
final int length = output.length();
|
||||
if (length > 0
|
||||
&& '\n' != output.charAt(length - 1)) {
|
||||
append(output, "\n");
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
package ru.noties.markwon.html.jsoup;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * Unchecked wrapper of an {@link IOException}, the wrapped exception is kept as the cause.
 */
public class UncheckedIOException extends RuntimeException {

    /**
     * @param cause exception to wrap
     */
    public UncheckedIOException(IOException cause) {
        super(cause);
    }

    /**
     * @return the wrapped exception (the cause of this one)
     */
    public IOException ioException() {
        final Throwable cause = getCause();
        return (IOException) cause;
    }
}
|
@ -0,0 +1,18 @@
|
||||
package ru.noties.markwon.html.jsoup.helper;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * Util methods for normalizing strings. Jsoup internal use only, please don't depend on this API.
 */
public final class Normalizer {

    /**
     * @param input string to convert, can be null
     * @return input lower-cased with {@link Locale#ENGLISH} rules, empty string for null input
     */
    public static String lowerCase(final String input) {
        if (input == null) {
            return "";
        }
        return input.toLowerCase(Locale.ENGLISH);
    }

    /**
     * @param input string to normalize, can be null
     * @return lower-cased (see {@link #lowerCase(String)}) and trimmed input
     */
    public static String normalize(final String input) {
        final String lowerCased = lowerCase(input);
        return lowerCased.trim();
    }
}
|
||||
|
@ -0,0 +1,112 @@
|
||||
package ru.noties.markwon.html.jsoup.helper;
|
||||
|
||||
/**
 * Simple validation methods. Designed for jsoup internal use. Every failed validation is
 * reported with an {@link IllegalArgumentException}.
 */
public final class Validate {

    private Validate() {}

    /**
     * Validates that the object is not null
     * @param obj object to test
     */
    public static void notNull(Object obj) {
        notNull(obj, "Object must not be null");
    }

    /**
     * Validates that the object is not null
     * @param obj object to test
     * @param msg message to output if validation fails
     */
    public static void notNull(Object obj, String msg) {
        if (obj == null) {
            throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Validates that the value is true
     * @param val object to test
     */
    public static void isTrue(boolean val) {
        isTrue(val, "Must be true");
    }

    /**
     * Validates that the value is true
     * @param val object to test
     * @param msg message to output if validation fails
     */
    public static void isTrue(boolean val, String msg) {
        if (!val) {
            throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Validates that the value is false
     * @param val object to test
     */
    public static void isFalse(boolean val) {
        isFalse(val, "Must be false");
    }

    /**
     * Validates that the value is false
     * @param val object to test
     * @param msg message to output if validation fails
     */
    public static void isFalse(boolean val, String msg) {
        if (val) {
            throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Validates that the array contains no null elements
     * @param objects the array to test
     */
    public static void noNullElements(Object[] objects) {
        noNullElements(objects, "Array must not contain any null objects");
    }

    /**
     * Validates that the array contains no null elements
     * @param objects the array to test
     * @param msg message to output if validation fails
     */
    public static void noNullElements(Object[] objects, String msg) {
        for (int i = 0, length = objects.length; i < length; i++) {
            if (objects[i] == null) {
                throw new IllegalArgumentException(msg);
            }
        }
    }

    /**
     * Validates that the string is not empty
     * @param string the string to test
     */
    public static void notEmpty(String string) {
        notEmpty(string, "String must not be empty");
    }

    /**
     * Validates that the string is not empty
     * @param string the string to test
     * @param msg message to output if validation fails
     */
    public static void notEmpty(String string, String msg) {
        final boolean empty = string == null || string.length() == 0;
        if (empty) {
            throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Cause a failure.
     * @param msg message to output.
     */
    public static void fail(String msg) {
        throw new IllegalArgumentException(msg);
    }
}
|
@ -0,0 +1,202 @@
|
||||
package ru.noties.markwon.html.jsoup.nodes;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import ru.noties.markwon.html.jsoup.helper.Validate;
|
||||
|
||||
/**
|
||||
A single key + value attribute. (Only used for presentation.)
|
||||
*/
|
||||
public class Attribute implements Map.Entry<String, String>, Cloneable {
|
||||
// private static final String[] booleanAttributes = {
|
||||
// "allowfullscreen", "async", "autofocus", "checked", "compact", "declare", "default", "defer", "disabled",
|
||||
// "formnovalidate", "hidden", "inert", "ismap", "itemscope", "multiple", "muted", "nohref", "noresize",
|
||||
// "noshade", "novalidate", "nowrap", "open", "readonly", "required", "reversed", "seamless", "selected",
|
||||
// "sortable", "truespeed", "typemustmatch"
|
||||
// };
|
||||
|
||||
private String key;
|
||||
private String val;
|
||||
Attributes parent; // used to update the holding Attributes when the key / value is changed via this interface
|
||||
|
||||
/**
 * Create a new attribute from unencoded (raw) key and value. The created attribute has
 * no parent {@link Attributes}.
 * @param key attribute key; case is preserved.
 * @param value attribute value
 */
public Attribute(String key, String value) {
    // delegate to the full constructor without a parent
    this(key, value, null);
}
|
||||
|
||||
/**
 * Create a new attribute from unencoded (raw) key and value.
 * @param key attribute key; case is preserved. Surrounding whitespace is trimmed; must not be
 *            null nor empty after trimming
 * @param val attribute value
 * @param parent the containing Attributes (this Attribute is not automatically added to said Attributes)
 * @throws IllegalArgumentException if key is null or is empty after trimming
 */
public Attribute(String key, String val, Attributes parent) {
    Validate.notNull(key);
    // validate the trimmed key (the one that is actually stored), otherwise a key that
    // consists of whitespace only would pass the validation and be stored as an empty string
    key = key.trim();
    Validate.notEmpty(key); // trimming could potentially make empty, so validate here
    this.key = key;
    this.val = val;
    this.parent = parent;
}
|
||||
|
||||
/**
|
||||
Get the attribute key.
|
||||
@return the attribute key
|
||||
*/
|
||||
public String getKey() {
|
||||
return key;
|
||||
}
|
||||
|
||||
/**
|
||||
Set the attribute key; case is preserved.
|
||||
@param key the new key; must not be null
|
||||
*/
|
||||
public void setKey(String key) {
|
||||
Validate.notNull(key);
|
||||
key = key.trim();
|
||||
Validate.notEmpty(key); // trimming could potentially make empty, so validate here
|
||||
if (parent != null) {
|
||||
int i = parent.indexOfKey(this.key);
|
||||
if (i != Attributes.NotFound)
|
||||
parent.keys[i] = key;
|
||||
}
|
||||
this.key = key;
|
||||
}
|
||||
|
||||
/**
|
||||
Get the attribute value.
|
||||
@return the attribute value
|
||||
*/
|
||||
public String getValue() {
|
||||
return val;
|
||||
}
|
||||
|
||||
/**
|
||||
Set the attribute value.
|
||||
@param val the new attribute value; must not be null
|
||||
*/
|
||||
public String setValue(String val) {
|
||||
String oldVal = parent.get(this.key);
|
||||
if (parent != null) {
|
||||
int i = parent.indexOfKey(this.key);
|
||||
if (i != Attributes.NotFound)
|
||||
parent.vals[i] = val;
|
||||
}
|
||||
this.val = val;
|
||||
return oldVal;
|
||||
}
|
||||
|
||||
// /**
|
||||
// Get the HTML representation of this attribute; e.g. {@code href="index.html"}.
|
||||
// @return HTML
|
||||
// */
|
||||
// public String html() {
|
||||
// StringBuilder accum = new StringBuilder();
|
||||
//
|
||||
// try {
|
||||
// html(accum, (new Document("")).outputSettings());
|
||||
// } catch(IOException exception) {
|
||||
// throw new SerializationException(exception);
|
||||
// }
|
||||
// return accum.toString();
|
||||
// }
|
||||
//
|
||||
// protected static void html(String key, String val, Appendable accum, Document.OutputSettings out) throws IOException {
|
||||
// accum.append(key);
|
||||
// if (!shouldCollapseAttribute(key, val, out)) {
|
||||
// accum.append("=\"");
|
||||
// Entities.escape(accum, Attributes.checkNotNull(val) , out, true, false, false);
|
||||
// accum.append('"');
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// protected void html(Appendable accum, Document.OutputSettings out) throws IOException {
|
||||
// html(key, val, accum, out);
|
||||
// }
|
||||
|
||||
// /**
|
||||
// Get the string representation of this attribute, implemented as {@link #html()}.
|
||||
// @return string
|
||||
// */
|
||||
// @Override
|
||||
// public String toString() {
|
||||
// return html();
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * Create a new Attribute from an unencoded key and a HTML attribute encoded value.
|
||||
// * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars.
|
||||
// * @param encodedValue HTML attribute encoded value
|
||||
// * @return attribute
|
||||
// */
|
||||
// public static Attribute createFromEncoded(String unencodedKey, String encodedValue) {
|
||||
// String value = Entities.unescape(encodedValue, true);
|
||||
// return new Attribute(unencodedKey, value, null); // parent will get set when Put
|
||||
// }
|
||||
|
||||
protected boolean isDataAttribute() {
|
||||
return isDataAttribute(key);
|
||||
}
|
||||
|
||||
protected static boolean isDataAttribute(String key) {
|
||||
return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length();
|
||||
}
|
||||
|
||||
// /**
|
||||
// * Collapsible if it's a boolean attribute and value is empty or same as name
|
||||
// *
|
||||
// * @param out output settings
|
||||
// * @return Returns whether collapsible or not
|
||||
// */
|
||||
// protected final boolean shouldCollapseAttribute(Document.OutputSettings out) {
|
||||
// return shouldCollapseAttribute(key, val, out);
|
||||
// }
|
||||
|
||||
// protected static boolean shouldCollapseAttribute(final String key, final String val, final Document.OutputSettings out) {
|
||||
// return (
|
||||
// out.syntax() == Document.OutputSettings.Syntax.html &&
|
||||
// (val == null || ("".equals(val) || val.equalsIgnoreCase(key)) && Attribute.isBooleanAttribute(key)));
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * @deprecated
|
||||
// */
|
||||
// protected boolean isBooleanAttribute() {
|
||||
// return Arrays.binarySearch(booleanAttributes, key) >= 0 || val == null;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Checks if this attribute name is defined as a boolean attribute in HTML5
|
||||
// */
|
||||
// protected static boolean isBooleanAttribute(final String key) {
|
||||
// return Arrays.binarySearch(booleanAttributes, key) >= 0;
|
||||
// }
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) { // note parent not considered
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
Attribute attribute = (Attribute) o;
|
||||
if (key != null ? !key.equals(attribute.key) : attribute.key != null) return false;
|
||||
return val != null ? val.equals(attribute.val) : attribute.val == null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() { // note parent not considered
|
||||
int result = key != null ? key.hashCode() : 0;
|
||||
result = 31 * result + (val != null ? val.hashCode() : 0);
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Attribute clone() {
|
||||
try {
|
||||
return (Attribute) super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,444 @@
|
||||
package ru.noties.markwon.html.jsoup.nodes;
|
||||
|
||||
import java.util.AbstractMap;
|
||||
import java.util.AbstractSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import ru.noties.markwon.html.jsoup.helper.Validate;
|
||||
|
||||
import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase;
|
||||
|
||||
/**
|
||||
* The attributes of an Element.
|
||||
* <p>
|
||||
* Attributes are treated as a map: there can be only one value associated with an attribute key/name.
|
||||
* </p>
|
||||
* <p>
|
||||
* Attribute name and value comparisons are generally <b>case sensitive</b>. By default for HTML, attribute names are
|
||||
* normalized to lower-case on parsing. That means you should use lower-case strings when referring to attributes by
|
||||
* name.
|
||||
* </p>
|
||||
*
|
||||
* @author Jonathan Hedley, jonathan@hedley.net
|
||||
*/
|
||||
public class Attributes implements Iterable<Attribute>, Cloneable {
|
||||
protected static final String dataPrefix = "data-";
|
||||
private static final int InitialCapacity = 4; // todo - analyze Alexa 1MM sites, determine best setting
|
||||
|
||||
// manages the key/val arrays
|
||||
private static final int GrowthFactor = 2;
|
||||
private static final String[] Empty = {};
|
||||
static final int NotFound = -1;
|
||||
private static final String EmptyString = "";
|
||||
|
||||
private int size = 0; // number of slots used (not capacity, which is keys.length)
|
||||
String[] keys = Empty;
|
||||
String[] vals = Empty;
|
||||
|
||||
// check there's room for more
|
||||
private void checkCapacity(int minNewSize) {
|
||||
Validate.isTrue(minNewSize >= size);
|
||||
int curSize = keys.length;
|
||||
if (curSize >= minNewSize)
|
||||
return;
|
||||
|
||||
int newSize = curSize >= InitialCapacity ? size * GrowthFactor : InitialCapacity;
|
||||
if (minNewSize > newSize)
|
||||
newSize = minNewSize;
|
||||
|
||||
keys = copyOf(keys, newSize);
|
||||
vals = copyOf(vals, newSize);
|
||||
}
|
||||
|
||||
// simple implementation of Arrays.copy, for support of Android API 8.
|
||||
private static String[] copyOf(String[] orig, int size) {
|
||||
final String[] copy = new String[size];
|
||||
System.arraycopy(orig, 0, copy, 0,
|
||||
Math.min(orig.length, size));
|
||||
return copy;
|
||||
}
|
||||
|
||||
int indexOfKey(String key) {
|
||||
Validate.notNull(key);
|
||||
for (int i = 0; i < size; i++) {
|
||||
if (key.equals(keys[i]))
|
||||
return i;
|
||||
}
|
||||
return NotFound;
|
||||
}
|
||||
|
||||
private int indexOfKeyIgnoreCase(String key) {
|
||||
Validate.notNull(key);
|
||||
for (int i = 0; i < size; i++) {
|
||||
if (key.equalsIgnoreCase(keys[i]))
|
||||
return i;
|
||||
}
|
||||
return NotFound;
|
||||
}
|
||||
|
||||
// we track boolean attributes as null in values - they're just keys. so returns empty for consumers
|
||||
static String checkNotNull(String val) {
|
||||
return val == null ? EmptyString : val;
|
||||
}
|
||||
|
||||
/**
|
||||
Get an attribute value by key.
|
||||
@param key the (case-sensitive) attribute key
|
||||
@return the attribute value if set; or empty string if not set (or a boolean attribute).
|
||||
@see #hasKey(String)
|
||||
*/
|
||||
public String get(String key) {
|
||||
int i = indexOfKey(key);
|
||||
return i == NotFound ? EmptyString : checkNotNull(vals[i]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an attribute's value by case-insensitive key
|
||||
* @param key the attribute name
|
||||
* @return the first matching attribute value if set; or empty string if not set (ora boolean attribute).
|
||||
*/
|
||||
public String getIgnoreCase(String key) {
|
||||
int i = indexOfKeyIgnoreCase(key);
|
||||
return i == NotFound ? EmptyString : checkNotNull(vals[i]);
|
||||
}
|
||||
|
||||
// adds without checking if this key exists
|
||||
private void add(String key, String value) {
|
||||
checkCapacity(size + 1);
|
||||
keys[size] = key;
|
||||
vals[size] = value;
|
||||
size++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a new attribute, or replace an existing one by key.
|
||||
* @param key case sensitive attribute key
|
||||
* @param value attribute value
|
||||
* @return these attributes, for chaining
|
||||
*/
|
||||
public Attributes put(String key, String value) {
|
||||
int i = indexOfKey(key);
|
||||
if (i != NotFound)
|
||||
vals[i] = value;
|
||||
else
|
||||
add(key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
void putIgnoreCase(String key, String value) {
|
||||
int i = indexOfKeyIgnoreCase(key);
|
||||
if (i != NotFound) {
|
||||
vals[i] = value;
|
||||
if (!keys[i].equals(key)) // case changed, update
|
||||
keys[i] = key;
|
||||
}
|
||||
else
|
||||
add(key, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a new boolean attribute, remove attribute if value is false.
|
||||
* @param key case <b>insensitive</b> attribute key
|
||||
* @param value attribute value
|
||||
* @return these attributes, for chaining
|
||||
*/
|
||||
public Attributes put(String key, boolean value) {
|
||||
if (value)
|
||||
putIgnoreCase(key, null);
|
||||
else
|
||||
remove(key);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
Set a new attribute, or replace an existing one by key.
|
||||
@param attribute attribute with case sensitive key
|
||||
@return these attributes, for chaining
|
||||
*/
|
||||
public Attributes put(Attribute attribute) {
|
||||
Validate.notNull(attribute);
|
||||
put(attribute.getKey(), attribute.getValue());
|
||||
attribute.parent = this;
|
||||
return this;
|
||||
}
|
||||
|
||||
// removes and shifts up
|
||||
private void remove(int index) {
|
||||
Validate.isFalse(index >= size);
|
||||
int shifted = size - index - 1;
|
||||
if (shifted > 0) {
|
||||
System.arraycopy(keys, index + 1, keys, index, shifted);
|
||||
System.arraycopy(vals, index + 1, vals, index, shifted);
|
||||
}
|
||||
size--;
|
||||
keys[size] = null; // release hold
|
||||
vals[size] = null;
|
||||
}
|
||||
|
||||
/**
|
||||
Remove an attribute by key. <b>Case sensitive.</b>
|
||||
@param key attribute key to remove
|
||||
*/
|
||||
public void remove(String key) {
|
||||
int i = indexOfKey(key);
|
||||
if (i != NotFound)
|
||||
remove(i);
|
||||
}
|
||||
|
||||
/**
|
||||
Remove an attribute by key. <b>Case insensitive.</b>
|
||||
@param key attribute key to remove
|
||||
*/
|
||||
public void removeIgnoreCase(String key) {
|
||||
int i = indexOfKeyIgnoreCase(key);
|
||||
if (i != NotFound)
|
||||
remove(i);
|
||||
}
|
||||
|
||||
/**
|
||||
Tests if these attributes contain an attribute with this key.
|
||||
@param key case-sensitive key to check for
|
||||
@return true if key exists, false otherwise
|
||||
*/
|
||||
public boolean hasKey(String key) {
|
||||
return indexOfKey(key) != NotFound;
|
||||
}
|
||||
|
||||
/**
|
||||
Tests if these attributes contain an attribute with this key.
|
||||
@param key key to check for
|
||||
@return true if key exists, false otherwise
|
||||
*/
|
||||
public boolean hasKeyIgnoreCase(String key) {
|
||||
return indexOfKeyIgnoreCase(key) != NotFound;
|
||||
}
|
||||
|
||||
/**
|
||||
Get the number of attributes in this set.
|
||||
@return size
|
||||
*/
|
||||
public int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
Add all the attributes from the incoming set to this set.
|
||||
@param incoming attributes to add to these attributes.
|
||||
*/
|
||||
public void addAll(Attributes incoming) {
|
||||
if (incoming.size() == 0)
|
||||
return;
|
||||
checkCapacity(size + incoming.size);
|
||||
|
||||
for (Attribute attr : incoming) {
|
||||
// todo - should this be case insensitive?
|
||||
put(attr);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public Iterator<Attribute> iterator() {
|
||||
return new Iterator<Attribute>() {
|
||||
int i = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i < size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Attribute next() {
|
||||
final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this);
|
||||
i++;
|
||||
return attr;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
Attributes.this.remove(--i); // next() advanced, so rewind
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
Get the attributes as a List, for iteration.
|
||||
@return an view of the attributes as an unmodifialbe List.
|
||||
*/
|
||||
public List<Attribute> asList() {
|
||||
ArrayList<Attribute> list = new ArrayList<>(size);
|
||||
for (int i = 0; i < size; i++) {
|
||||
// Attribute attr = vals[i] == null ?
|
||||
// new BooleanAttribute(keys[i]) : // deprecated class, but maybe someone still wants it
|
||||
// new Attribute(keys[i], vals[i], Attributes.this);
|
||||
// list.add(attr);
|
||||
list.add(new Attribute(keys[i], vals[i], Attributes.this));
|
||||
}
|
||||
return Collections.unmodifiableList(list);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys
|
||||
* starting with {@code data-}.
|
||||
* @return map of custom data attributes.
|
||||
*/
|
||||
public Map<String, String> dataset() {
|
||||
return new Dataset(this);
|
||||
}
|
||||
|
||||
// /**
|
||||
// Get the HTML representation of these attributes.
|
||||
// @return HTML
|
||||
// @throws SerializationException if the HTML representation of the attributes cannot be constructed.
|
||||
// */
|
||||
// public String html() {
|
||||
// StringBuilder accum = new StringBuilder();
|
||||
// try {
|
||||
// html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used
|
||||
// } catch (IOException e) { // ought never happen
|
||||
// throw new SerializationException(e);
|
||||
// }
|
||||
// return accum.toString();
|
||||
// }
|
||||
//
|
||||
// final void html(final Appendable accum, final Document.OutputSettings out) throws IOException {
|
||||
// final int sz = size;
|
||||
// for (int i = 0; i < sz; i++) {
|
||||
// // inlined from Attribute.html()
|
||||
// final String key = keys[i];
|
||||
// final String val = vals[i];
|
||||
// accum.append(' ').append(key);
|
||||
//
|
||||
// // collapse checked=null, checked="", checked=checked; write out others
|
||||
// if (!Attribute.shouldCollapseAttribute(key, val, out)) {
|
||||
// accum.append("=\"");
|
||||
// Entities.escape(accum, val == null ? EmptyString : val, out, true, false, false);
|
||||
// accum.append('"');
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public String toString() {
|
||||
// return html();
|
||||
// }
|
||||
|
||||
/**
|
||||
* Checks if these attributes are equal to another set of attributes, by comparing the two sets
|
||||
* @param o attributes to compare with
|
||||
* @return if both sets of attributes have the same content
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
Attributes that = (Attributes) o;
|
||||
|
||||
if (size != that.size) return false;
|
||||
if (!Arrays.equals(keys, that.keys)) return false;
|
||||
return Arrays.equals(vals, that.vals);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the hashcode of these attributes, by iterating all attributes and summing their hashcodes.
|
||||
* @return calculated hashcode
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = size;
|
||||
result = 31 * result + Arrays.hashCode(keys);
|
||||
result = 31 * result + Arrays.hashCode(vals);
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Attributes clone() {
|
||||
Attributes clone;
|
||||
try {
|
||||
clone = (Attributes) super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
clone.size = size;
|
||||
keys = copyOf(keys, size);
|
||||
vals = copyOf(vals, size);
|
||||
return clone;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method. Lowercases all keys.
|
||||
*/
|
||||
public void normalize() {
|
||||
for (int i = 0; i < size; i++) {
|
||||
keys[i] = lowerCase(keys[i]);
|
||||
}
|
||||
}
|
||||
|
||||
private static class Dataset extends AbstractMap<String, String> {
|
||||
private final Attributes attributes;
|
||||
|
||||
private Dataset(Attributes attributes) {
|
||||
this.attributes = attributes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Entry<String, String>> entrySet() {
|
||||
return new EntrySet();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String put(String key, String value) {
|
||||
String dataKey = dataKey(key);
|
||||
String oldValue = attributes.hasKey(dataKey) ? attributes.get(dataKey) : null;
|
||||
attributes.put(dataKey, value);
|
||||
return oldValue;
|
||||
}
|
||||
|
||||
private class EntrySet extends AbstractSet<Map.Entry<String, String>> {
|
||||
|
||||
@Override
|
||||
public Iterator<Map.Entry<String, String>> iterator() {
|
||||
return new DatasetIterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
int count = 0;
|
||||
Iterator iter = new DatasetIterator();
|
||||
while (iter.hasNext())
|
||||
count++;
|
||||
return count;
|
||||
}
|
||||
}
|
||||
|
||||
private class DatasetIterator implements Iterator<Map.Entry<String, String>> {
|
||||
private Iterator<Attribute> attrIter = attributes.iterator();
|
||||
private Attribute attr;
|
||||
public boolean hasNext() {
|
||||
while (attrIter.hasNext()) {
|
||||
attr = attrIter.next();
|
||||
if (attr.isDataAttribute()) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public Entry<String, String> next() {
|
||||
return new Attribute(attr.getKey().substring(dataPrefix.length()), attr.getValue());
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
attributes.remove(attr.getKey());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static String dataKey(String key) {
|
||||
return dataPrefix + key;
|
||||
}
|
||||
}
|
@ -0,0 +1,104 @@
|
||||
package ru.noties.markwon.html.jsoup.nodes;
|
||||
|
||||
/**
 * A {@code <!DOCTYPE>} node.
 * <p>
 * In this port the class does not extend a node type; only the two keyword constants are kept.
 * </p>
 */
public class DocumentType /*extends LeafNode*/ {
    // todo needs a bit of a chunky cleanup. this level of detail isn't needed
    // todo: quirk mode from publicId and systemId

    /** Keyword of a doctype that carries a public identifier. */
    public static final String PUBLIC_KEY = "PUBLIC";

    /** Keyword of a doctype that carries a system identifier. */
    public static final String SYSTEM_KEY = "SYSTEM";

    // Intentionally left out of this port (all of it relied on node attribute storage and on
    // Document.OutputSettings):
    //  - the private attribute-name constants NAME ("name"), PUB_SYS_KEY ("pubSysKey"),
    //    PUBLIC_ID ("publicId") and SYSTEM_ID ("systemId")
    //  - the constructors taking name / pubSysKey / publicId / systemId / baseUri
    //  - setPubSysKey(String), nodeName(), outerHtmlHead(...), outerHtmlTail(...) and has(String)
}
|
||||
|
@ -0,0 +1,351 @@
|
||||
package ru.noties.markwon.html.jsoup.nodes;
|
||||
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
||||
import ru.noties.markwon.html.jsoup.helper.Validate;
|
||||
import ru.noties.markwon.html.jsoup.parser.CharacterReader;
|
||||
|
||||
import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.base;
|
||||
import static ru.noties.markwon.html.jsoup.nodes.Entities.EscapeMode.extended;
|
||||
|
||||
/**
|
||||
* HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
|
||||
* HTML named character references</a>.
|
||||
*/
|
||||
public class Entities {
|
||||
private static final int empty = -1;
|
||||
private static final String emptyName = "";
|
||||
static final int codepointRadix = 36;
|
||||
private static final char[] codeDelims = {',', ';'};
|
||||
private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
|
||||
// private static final Document.OutputSettings DefaultOutput = new Document.OutputSettings();
|
||||
|
||||
public enum EscapeMode {
|
||||
/**
|
||||
* Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
|
||||
*/
|
||||
xhtml(EntitiesData.xmlPoints, 4),
|
||||
/**
|
||||
* Default HTML output entities.
|
||||
*/
|
||||
base(EntitiesData.basePoints, 106),
|
||||
/**
|
||||
* Complete HTML entities.
|
||||
*/
|
||||
extended(EntitiesData.fullPoints, 2125);
|
||||
|
||||
// table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
|
||||
private String[] nameKeys;
|
||||
private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
|
||||
|
||||
// table of codepoints to named entities.
|
||||
private int[] codeKeys; // we don' support multicodepoints to single named value currently
|
||||
private String[] nameVals;
|
||||
|
||||
EscapeMode(String file, int size) {
|
||||
load(this, file, size);
|
||||
}
|
||||
|
||||
int codepointForName(final String name) {
|
||||
int index = Arrays.binarySearch(nameKeys, name);
|
||||
return index >= 0 ? codeVals[index] : empty;
|
||||
}
|
||||
|
||||
String nameForCodepoint(final int codepoint) {
|
||||
final int index = Arrays.binarySearch(codeKeys, codepoint);
|
||||
if (index >= 0) {
|
||||
// the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
|
||||
// (and binary search for same item with multi results is undefined
|
||||
return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
|
||||
nameVals[index + 1] : nameVals[index];
|
||||
}
|
||||
return emptyName;
|
||||
}
|
||||
|
||||
private int size() {
|
||||
return nameKeys.length;
|
||||
}
|
||||
}
|
||||
|
||||
// Not instantiable from outside: this class only exposes static members.
private Entities() {
}
|
||||
|
||||
/**
|
||||
* Check if the input is a known named entity
|
||||
*
|
||||
* @param name the possible entity name (e.g. "lt" or "amp")
|
||||
* @return true if a known named entity
|
||||
*/
|
||||
public static boolean isNamedEntity(final String name) {
|
||||
return extended.codepointForName(name) != empty;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the input is a known named entity in the base entity set.
|
||||
*
|
||||
* @param name the possible entity name (e.g. "lt" or "amp")
|
||||
* @return true if a known named entity in the base set
|
||||
* @see #isNamedEntity(String)
|
||||
*/
|
||||
public static boolean isBaseNamedEntity(final String name) {
|
||||
return base.codepointForName(name) != empty;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the Character value of the named entity
|
||||
*
|
||||
* @param name named entity (e.g. "lt" or "amp")
|
||||
* @return the Character value of the named entity (e.g. '{@literal <}' or '{@literal &}')
|
||||
* @deprecated does not support characters outside the BMP or multiple character names
|
||||
*/
|
||||
public static Character getCharacterByName(String name) {
|
||||
return (char) extended.codepointForName(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the character(s) represented by the named entity
|
||||
*
|
||||
* @param name entity (e.g. "lt" or "amp")
|
||||
* @return the string value of the character(s) represented by this entity, or "" if not defined
|
||||
*/
|
||||
public static String getByName(String name) {
|
||||
String val = multipoints.get(name);
|
||||
if (val != null)
|
||||
return val;
|
||||
int codepoint = extended.codepointForName(name);
|
||||
if (codepoint != empty)
|
||||
return new String(new int[]{codepoint}, 0, 1);
|
||||
return emptyName;
|
||||
}
|
||||
|
||||
public static int codepointsForName(final String name, final int[] codepoints) {
|
||||
String val = multipoints.get(name);
|
||||
if (val != null) {
|
||||
codepoints[0] = val.codePointAt(0);
|
||||
codepoints[1] = val.codePointAt(1);
|
||||
return 2;
|
||||
}
|
||||
int codepoint = extended.codepointForName(name);
|
||||
if (codepoint != empty) {
|
||||
codepoints[0] = codepoint;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// /**
|
||||
// * HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
|
||||
// *
|
||||
// * @param string the un-escaped string to escape
|
||||
// * @param out the output settings to use
|
||||
// * @return the escaped string
|
||||
// */
|
||||
// public static String escape(String string, Document.OutputSettings out) {
|
||||
// if (string == null)
|
||||
// return "";
|
||||
// StringBuilder accum = new StringBuilder(string.length() * 2);
|
||||
// try {
|
||||
// escape(accum, string, out, false, false, false);
|
||||
// } catch (IOException e) {
|
||||
// throw new SerializationException(e); // doesn't happen
|
||||
// }
|
||||
// return accum.toString();
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
|
||||
// * {@code &lt;}
|
||||
// *
|
||||
// * @param string the un-escaped string to escape
|
||||
// * @return the escaped string
|
||||
// */
|
||||
// public static String escape(String string) {
|
||||
// return escape(string, DefaultOutput);
|
||||
// }
|
||||
//
|
||||
// // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
|
||||
// static void escape(Appendable accum, String string, Document.OutputSettings out,
|
||||
// boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite) throws IOException {
|
||||
//
|
||||
// boolean lastWasWhite = false;
|
||||
// boolean reachedNonWhite = false;
|
||||
// final EscapeMode escapeMode = out.escapeMode();
|
||||
// final CharsetEncoder encoder = out.encoder();
|
||||
// final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
|
||||
// final int length = string.length();
|
||||
//
|
||||
// int codePoint;
|
||||
// for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
|
||||
// codePoint = string.codePointAt(offset);
|
||||
//
|
||||
// if (normaliseWhite) {
|
||||
// if (StringUtil.isWhitespace(codePoint)) {
|
||||
// if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
|
||||
// continue;
|
||||
// accum.append(' ');
|
||||
// lastWasWhite = true;
|
||||
// continue;
|
||||
// } else {
|
||||
// lastWasWhite = false;
|
||||
// reachedNonWhite = true;
|
||||
// }
|
||||
// }
|
||||
// // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
|
||||
// if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
|
||||
// final char c = (char) codePoint;
|
||||
// // html specific and required escapes:
|
||||
// switch (c) {
|
||||
// case '&':
|
||||
// accum.append("&amp;");
|
||||
// break;
|
||||
// case 0xA0:
|
||||
// if (escapeMode != EscapeMode.xhtml)
|
||||
// accum.append("&nbsp;");
|
||||
// else
|
||||
// accum.append("&#xa0;");
|
||||
// break;
|
||||
// case '<':
|
||||
// // escape when in character data or when in a xml attribue val; not needed in html attr val
|
||||
// if (!inAttribute || escapeMode == EscapeMode.xhtml)
|
||||
// accum.append("&lt;");
|
||||
// else
|
||||
// accum.append(c);
|
||||
// break;
|
||||
// case '>':
|
||||
// if (!inAttribute)
|
||||
// accum.append("&gt;");
|
||||
// else
|
||||
// accum.append(c);
|
||||
// break;
|
||||
// case '"':
|
||||
// if (inAttribute)
|
||||
// accum.append("&quot;");
|
||||
// else
|
||||
// accum.append(c);
|
||||
// break;
|
||||
// default:
|
||||
// if (canEncode(coreCharset, c, encoder))
|
||||
// accum.append(c);
|
||||
// else
|
||||
// appendEncoded(accum, escapeMode, codePoint);
|
||||
// }
|
||||
// } else {
|
||||
// final String c = new String(Character.toChars(codePoint));
|
||||
// if (encoder.canEncode(c)) // uses fallback encoder for simplicity
|
||||
// accum.append(c);
|
||||
// else
|
||||
// appendEncoded(accum, escapeMode, codePoint);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
|
||||
// final String name = escapeMode.nameForCodepoint(codePoint);
|
||||
// if (name != emptyName) // ok for identity check
|
||||
// accum.append('&').append(name).append(';');
|
||||
// else
|
||||
// accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
|
||||
// *
|
||||
// * @param string the HTML string to un-escape
|
||||
// * @return the unescaped string
|
||||
// */
|
||||
// public static String unescape(String string) {
|
||||
// return unescape(string, false);
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * Unescape the input string.
|
||||
// *
|
||||
// * @param string to un-HTML-escape
|
||||
// * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
|
||||
// * @return unescaped string
|
||||
// */
|
||||
// static String unescape(String string, boolean strict) {
|
||||
// return Parser.unescapeEntities(string, strict);
|
||||
// }
|
||||
|
||||
/*
|
||||
* Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
|
||||
* After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
|
||||
* performance may be bad. We can add more encoders for common character sets that are impacted by performance
|
||||
* issues on Android if required.
|
||||
*
|
||||
* Benchmarks: *
|
||||
* OLD toHtml() impl v New (fastpath) in millis
|
||||
* Wiki: 1895, 16
|
||||
* CNN: 6378, 55
|
||||
* Alterslash: 3013, 28
|
||||
* Jsoup: 167, 2
|
||||
*/
|
||||
private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
|
||||
// todo add more charset tests if impacted by Android's bad perf in canEncode
|
||||
switch (charset) {
|
||||
case ascii:
|
||||
return c < 0x80;
|
||||
case utf:
|
||||
return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
|
||||
default:
|
||||
return fallback.canEncode(c);
|
||||
}
|
||||
}
|
||||
|
||||
// Coarse classification of a charset by name, used to pick the fast path in canEncode.
enum CoreCharset {
    ascii, utf, fallback;

    // "US-ASCII" maps to ascii, any name starting with "UTF-" to utf, everything else to fallback.
    static CoreCharset byName(final String name) {
        final boolean isAscii = name.equals("US-ASCII");
        if (isAscii) {
            return ascii;
        }
        // covers UTF-8, UTF-16, et al
        return name.startsWith("UTF-") ? utf : fallback;
    }
}
|
||||
|
||||
private static void load(EscapeMode e, String pointsData, int size) {
|
||||
e.nameKeys = new String[size];
|
||||
e.codeVals = new int[size];
|
||||
e.codeKeys = new int[size];
|
||||
e.nameVals = new String[size];
|
||||
|
||||
int i = 0;
|
||||
CharacterReader reader = new CharacterReader(pointsData);
|
||||
|
||||
while (!reader.isEmpty()) {
|
||||
// NotNestedLessLess=10913,824;1887&
|
||||
|
||||
final String name = reader.consumeTo('=');
|
||||
reader.advance();
|
||||
final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
|
||||
final char codeDelim = reader.current();
|
||||
reader.advance();
|
||||
final int cp2;
|
||||
if (codeDelim == ',') {
|
||||
cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
|
||||
reader.advance();
|
||||
} else {
|
||||
cp2 = empty;
|
||||
}
|
||||
final String indexS = reader.consumeTo('&');
|
||||
final int index = Integer.parseInt(indexS, codepointRadix);
|
||||
reader.advance();
|
||||
|
||||
e.nameKeys[i] = name;
|
||||
e.codeVals[i] = cp1;
|
||||
e.codeKeys[index] = cp1;
|
||||
e.nameVals[index] = name;
|
||||
|
||||
if (cp2 != empty) {
|
||||
multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
Validate.isTrue(i == size, "Unexpected count of entities loaded");
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,483 @@
|
||||
package ru.noties.markwon.html.jsoup.parser;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Locale;
|
||||
|
||||
import ru.noties.markwon.html.jsoup.UncheckedIOException;
|
||||
import ru.noties.markwon.html.jsoup.helper.Validate;
|
||||
|
||||
/**
|
||||
CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
|
||||
*/
|
||||
public final class CharacterReader {
|
||||
static final char EOF = (char) -1;
|
||||
private static final int maxStringCacheLen = 12;
|
||||
static final int maxBufferLen = 1024 * 32; // visible for testing
|
||||
private static final int readAheadLimit = (int) (maxBufferLen * 0.75);
|
||||
|
||||
private final char[] charBuf;
|
||||
private final Reader reader;
|
||||
private int bufLength;
|
||||
private int bufSplitPoint;
|
||||
private int bufPos;
|
||||
private int readerPos;
|
||||
private int bufMark;
|
||||
private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage
|
||||
|
||||
public CharacterReader(Reader input, int sz) {
|
||||
Validate.notNull(input);
|
||||
Validate.isTrue(input.markSupported());
|
||||
reader = input;
|
||||
charBuf = new char[sz > maxBufferLen ? maxBufferLen : sz];
|
||||
bufferUp();
|
||||
}
|
||||
|
||||
/**
 * Creates a reader over {@code input} using the maximum buffer size ({@code maxBufferLen}).
 *
 * @param input character source
 */
public CharacterReader(Reader input) {
    this(input, maxBufferLen);
}
|
||||
|
||||
/**
 * Creates a reader over an in-memory string; the buffer is sized to the string length
 * (still capped by the two-argument constructor).
 *
 * @param input text to read
 */
public CharacterReader(String input) {
    this(new StringReader(input), input.length());
}
|
||||
|
||||
private void bufferUp() {
|
||||
if (bufPos < bufSplitPoint)
|
||||
return;
|
||||
|
||||
try {
|
||||
reader.skip(bufPos);
|
||||
reader.mark(maxBufferLen);
|
||||
final int read = reader.read(charBuf);
|
||||
reader.reset();
|
||||
if (read != -1) {
|
||||
bufLength = read;
|
||||
readerPos += bufPos;
|
||||
bufPos = 0;
|
||||
bufMark = 0;
|
||||
bufSplitPoint = bufLength > readAheadLimit ? readAheadLimit : bufLength;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the current cursor position in the content.
|
||||
* @return current position
|
||||
*/
|
||||
public int pos() {
|
||||
return readerPos + bufPos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests if all the content has been read.
|
||||
* @return true if nothing left to read.
|
||||
*/
|
||||
public boolean isEmpty() {
|
||||
bufferUp();
|
||||
return bufPos >= bufLength;
|
||||
}
|
||||
|
||||
private boolean isEmptyNoBufferUp() {
|
||||
return bufPos >= bufLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the char at the current position.
|
||||
* @return char
|
||||
*/
|
||||
public char current() {
|
||||
bufferUp();
|
||||
return isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
|
||||
}
|
||||
|
||||
char consume() {
|
||||
bufferUp();
|
||||
char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
|
||||
bufPos++;
|
||||
return val;
|
||||
}
|
||||
|
||||
void unconsume() {
|
||||
bufPos--;
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves the current position by one.
|
||||
*/
|
||||
public void advance() {
|
||||
bufPos++;
|
||||
}
|
||||
|
||||
void mark() {
|
||||
bufMark = bufPos;
|
||||
}
|
||||
|
||||
void rewindToMark() {
|
||||
bufPos = bufMark;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of characters between the current position and the next instance of the input char
|
||||
* @param c scan target
|
||||
* @return offset between current position and next instance of target. -1 if not found.
|
||||
*/
|
||||
int nextIndexOf(char c) {
|
||||
// doesn't handle scanning for surrogates
|
||||
bufferUp();
|
||||
for (int i = bufPos; i < bufLength; i++) {
|
||||
if (c == charBuf[i])
|
||||
return i - bufPos;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of characters between the current position and the next instance of the input sequence
|
||||
*
|
||||
* @param seq scan target
|
||||
* @return offset between current position and next instance of target. -1 if not found.
|
||||
*/
|
||||
int nextIndexOf(CharSequence seq) {
|
||||
bufferUp();
|
||||
// doesn't handle scanning for surrogates
|
||||
char startChar = seq.charAt(0);
|
||||
for (int offset = bufPos; offset < bufLength; offset++) {
|
||||
// scan to first instance of startchar:
|
||||
if (startChar != charBuf[offset])
|
||||
while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
|
||||
int i = offset + 1;
|
||||
int last = i + seq.length()-1;
|
||||
if (offset < bufLength && last <= bufLength) {
|
||||
for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
|
||||
if (i == last) // found full sequence
|
||||
return offset - bufPos;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads characters up to the specific char.
|
||||
* @param c the delimiter
|
||||
* @return the chars read
|
||||
*/
|
||||
public String consumeTo(char c) {
|
||||
int offset = nextIndexOf(c);
|
||||
if (offset != -1) {
|
||||
String consumed = cacheString(charBuf, stringCache, bufPos, offset);
|
||||
bufPos += offset;
|
||||
return consumed;
|
||||
} else {
|
||||
return consumeToEnd();
|
||||
}
|
||||
}
|
||||
|
||||
String consumeTo(String seq) {
|
||||
int offset = nextIndexOf(seq);
|
||||
if (offset != -1) {
|
||||
String consumed = cacheString(charBuf, stringCache, bufPos, offset);
|
||||
bufPos += offset;
|
||||
return consumed;
|
||||
} else {
|
||||
return consumeToEnd();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read characters until the first of any delimiters is found.
|
||||
* @param chars delimiters to scan for
|
||||
* @return characters read up to the matched delimiter.
|
||||
*/
|
||||
public String consumeToAny(final char... chars) {
|
||||
bufferUp();
|
||||
final int start = bufPos;
|
||||
final int remaining = bufLength;
|
||||
final char[] val = charBuf;
|
||||
|
||||
OUTER: while (bufPos < remaining) {
|
||||
for (char c : chars) {
|
||||
if (val[bufPos] == c)
|
||||
break OUTER;
|
||||
}
|
||||
bufPos++;
|
||||
}
|
||||
|
||||
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
|
||||
}
|
||||
|
||||
String consumeToAnySorted(final char... chars) {
|
||||
bufferUp();
|
||||
final int start = bufPos;
|
||||
final int remaining = bufLength;
|
||||
final char[] val = charBuf;
|
||||
|
||||
while (bufPos < remaining) {
|
||||
if (Arrays.binarySearch(chars, val[bufPos]) >= 0)
|
||||
break;
|
||||
bufPos++;
|
||||
}
|
||||
|
||||
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
|
||||
}
|
||||
|
||||
String consumeData() {
|
||||
// &, <, null
|
||||
bufferUp();
|
||||
final int start = bufPos;
|
||||
final int remaining = bufLength;
|
||||
final char[] val = charBuf;
|
||||
|
||||
while (bufPos < remaining) {
|
||||
final char c = val[bufPos];
|
||||
if (c == '&'|| c == '<' || c == TokeniserState.nullChar)
|
||||
break;
|
||||
bufPos++;
|
||||
}
|
||||
|
||||
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
|
||||
}
|
||||
|
||||
String consumeTagName() {
|
||||
// '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar
|
||||
bufferUp();
|
||||
final int start = bufPos;
|
||||
final int remaining = bufLength;
|
||||
final char[] val = charBuf;
|
||||
|
||||
while (bufPos < remaining) {
|
||||
final char c = val[bufPos];
|
||||
if (c == '\t'|| c == '\n'|| c == '\r'|| c == '\f'|| c == ' '|| c == '/'|| c == '>'|| c == TokeniserState.nullChar)
|
||||
break;
|
||||
bufPos++;
|
||||
}
|
||||
|
||||
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
|
||||
}
|
||||
|
||||
String consumeToEnd() {
|
||||
bufferUp();
|
||||
String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
|
||||
bufPos = bufLength;
|
||||
return data;
|
||||
}
|
||||
|
||||
String consumeLetterSequence() {
|
||||
bufferUp();
|
||||
int start = bufPos;
|
||||
while (bufPos < bufLength) {
|
||||
char c = charBuf[bufPos];
|
||||
if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
|
||||
bufPos++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
return cacheString(charBuf, stringCache, start, bufPos - start);
|
||||
}
|
||||
|
||||
String consumeLetterThenDigitSequence() {
|
||||
bufferUp();
|
||||
int start = bufPos;
|
||||
while (bufPos < bufLength) {
|
||||
char c = charBuf[bufPos];
|
||||
if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
|
||||
bufPos++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
while (!isEmptyNoBufferUp()) {
|
||||
char c = charBuf[bufPos];
|
||||
if (c >= '0' && c <= '9')
|
||||
bufPos++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
return cacheString(charBuf, stringCache, start, bufPos - start);
|
||||
}
|
||||
|
||||
String consumeHexSequence() {
|
||||
bufferUp();
|
||||
int start = bufPos;
|
||||
while (bufPos < bufLength) {
|
||||
char c = charBuf[bufPos];
|
||||
if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
|
||||
bufPos++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
return cacheString(charBuf, stringCache, start, bufPos - start);
|
||||
}
|
||||
|
||||
String consumeDigitSequence() {
|
||||
bufferUp();
|
||||
int start = bufPos;
|
||||
while (bufPos < bufLength) {
|
||||
char c = charBuf[bufPos];
|
||||
if (c >= '0' && c <= '9')
|
||||
bufPos++;
|
||||
else
|
||||
break;
|
||||
}
|
||||
return cacheString(charBuf, stringCache, start, bufPos - start);
|
||||
}
|
||||
|
||||
boolean matches(char c) {
|
||||
return !isEmpty() && charBuf[bufPos] == c;
|
||||
|
||||
}
|
||||
|
||||
boolean matches(String seq) {
|
||||
bufferUp();
|
||||
int scanLength = seq.length();
|
||||
if (scanLength > bufLength - bufPos)
|
||||
return false;
|
||||
|
||||
for (int offset = 0; offset < scanLength; offset++)
|
||||
if (seq.charAt(offset) != charBuf[bufPos +offset])
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean matchesIgnoreCase(String seq) {
|
||||
bufferUp();
|
||||
int scanLength = seq.length();
|
||||
if (scanLength > bufLength - bufPos)
|
||||
return false;
|
||||
|
||||
for (int offset = 0; offset < scanLength; offset++) {
|
||||
char upScan = Character.toUpperCase(seq.charAt(offset));
|
||||
char upTarget = Character.toUpperCase(charBuf[bufPos + offset]);
|
||||
if (upScan != upTarget)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean matchesAny(char... seq) {
|
||||
if (isEmpty())
|
||||
return false;
|
||||
|
||||
bufferUp();
|
||||
char c = charBuf[bufPos];
|
||||
for (char seek : seq) {
|
||||
if (seek == c)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
boolean matchesAnySorted(char[] seq) {
|
||||
bufferUp();
|
||||
return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
|
||||
}
|
||||
|
||||
boolean matchesLetter() {
|
||||
if (isEmpty())
|
||||
return false;
|
||||
char c = charBuf[bufPos];
|
||||
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
|
||||
}
|
||||
|
||||
boolean matchesDigit() {
|
||||
if (isEmpty())
|
||||
return false;
|
||||
char c = charBuf[bufPos];
|
||||
return (c >= '0' && c <= '9');
|
||||
}
|
||||
|
||||
boolean matchConsume(String seq) {
|
||||
bufferUp();
|
||||
if (matches(seq)) {
|
||||
bufPos += seq.length();
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
boolean matchConsumeIgnoreCase(String seq) {
|
||||
if (matchesIgnoreCase(seq)) {
|
||||
bufPos += seq.length();
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
boolean containsIgnoreCase(String seq) {
|
||||
// used to check presence of </title>, </style>. only finds consistent case.
|
||||
String loScan = seq.toLowerCase(Locale.ENGLISH);
|
||||
String hiScan = seq.toUpperCase(Locale.ENGLISH);
|
||||
return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new String(charBuf, bufPos, bufLength - bufPos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks.
|
||||
* <p />
|
||||
* Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
|
||||
* That saves both having to create objects as hash keys, and running through the entry list, at the expense of
|
||||
* some more duplicates.
|
||||
*/
|
||||
private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
|
||||
// limit (no cache):
|
||||
if (count > maxStringCacheLen)
|
||||
return new String(charBuf, start, count);
|
||||
if (count < 1)
|
||||
return "";
|
||||
|
||||
// calculate hash:
|
||||
int hash = 0;
|
||||
int offset = start;
|
||||
for (int i = 0; i < count; i++) {
|
||||
hash = 31 * hash + charBuf[offset++];
|
||||
}
|
||||
|
||||
// get from cache
|
||||
final int index = hash & stringCache.length - 1;
|
||||
String cached = stringCache[index];
|
||||
|
||||
if (cached == null) { // miss, add
|
||||
cached = new String(charBuf, start, count);
|
||||
stringCache[index] = cached;
|
||||
} else { // hashcode hit, check equality
|
||||
if (rangeEquals(charBuf, start, count, cached)) { // hit
|
||||
return cached;
|
||||
} else { // hashcode conflict
|
||||
cached = new String(charBuf, start, count);
|
||||
stringCache[index] = cached; // update the cache, as recently used strings are more likely to show up again
|
||||
}
|
||||
}
|
||||
return cached;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the value of the provided range equals the string.
|
||||
*/
|
||||
static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
|
||||
if (count == cached.length()) {
|
||||
int i = start;
|
||||
int j = 0;
|
||||
while (count-- != 0) {
|
||||
if (charBuf[i++] != cached.charAt(j++))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// just used for testing
|
||||
boolean rangeEquals(final int start, final int count, final String cached) {
|
||||
return rangeEquals(charBuf, start, count, cached);
|
||||
}
|
||||
}
|
@ -0,0 +1,41 @@
|
||||
package ru.noties.markwon.html.jsoup.parser;
|
||||
|
||||
/**
 * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the
 * tree building phase. Instances are immutable: position and message are fixed at construction.
 */
public class ParseError {
    private final int pos;
    private final String errorMsg;

    /**
     * @param pos      offset of the error within the input
     * @param errorMsg ready-to-use error message
     */
    ParseError(int pos, String errorMsg) {
        this.pos = pos;
        this.errorMsg = errorMsg;
    }

    /**
     * @param pos         offset of the error within the input
     * @param errorFormat {@link String#format(String, Object...)} pattern for the message
     * @param args        arguments for the pattern
     */
    ParseError(int pos, String errorFormat, Object... args) {
        this(pos, String.format(errorFormat, args));
    }

    /**
     * Retrieve the error message.
     *
     * @return the error message.
     */
    public String getErrorMessage() {
        return errorMsg;
    }

    /**
     * Retrieves the offset of the error.
     *
     * @return error offset within input
     */
    public int getPosition() {
        return pos;
    }

    /**
     * @return the error rendered as {@code "<position>: <message>"}
     */
    @Override
    public String toString() {
        return pos + ": " + errorMsg;
    }
}
|
||||
|
@ -0,0 +1,34 @@
|
||||
package ru.noties.markwon.html.jsoup.parser;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* A container for ParseErrors.
|
||||
*
|
||||
* @author Jonathan Hedley
|
||||
*/
|
||||
public class ParseErrorList extends ArrayList<ParseError>{
|
||||
private static final int INITIAL_CAPACITY = 16;
|
||||
private final int maxSize;
|
||||
|
||||
ParseErrorList(int initialCapacity, int maxSize) {
|
||||
super(initialCapacity);
|
||||
this.maxSize = maxSize;
|
||||
}
|
||||
|
||||
boolean canAddError() {
|
||||
return size() < maxSize;
|
||||
}
|
||||
|
||||
int getMaxSize() {
|
||||
return maxSize;
|
||||
}
|
||||
|
||||
public static ParseErrorList noTracking() {
|
||||
return new ParseErrorList(0, 0);
|
||||
}
|
||||
|
||||
public static ParseErrorList tracking(int maxSize) {
|
||||
return new ParseErrorList(INITIAL_CAPACITY, maxSize);
|
||||
}
|
||||
}
|
@ -0,0 +1,398 @@
|
||||
package ru.noties.markwon.html.jsoup.parser;
|
||||
|
||||
import android.support.annotation.NonNull;
|
||||
|
||||
import ru.noties.markwon.html.jsoup.helper.Validate;
|
||||
import ru.noties.markwon.html.jsoup.nodes.Attributes;
|
||||
|
||||
import static ru.noties.markwon.html.jsoup.helper.Normalizer.lowerCase;
|
||||
|
||||
/**
|
||||
* Parse tokens for the Tokeniser.
|
||||
*/
|
||||
public abstract class Token {
|
||||
|
||||
public final TokenType type;
|
||||
|
||||
/**
 * @param tokenType kind of this token; stored in the public {@code type} field
 */
protected Token(@NonNull TokenType tokenType) {
    type = tokenType;
}
|
||||
|
||||
// String tokenType() {
|
||||
// return this.getClass().getSimpleName();
|
||||
// }
|
||||
|
||||
/**
|
||||
* Reset the data represent by this token, for reuse. Prevents the need to create transfer objects for every
|
||||
* piece of data, which immediately get GCed.
|
||||
*/
|
||||
public abstract Token reset();
|
||||
|
||||
static void reset(StringBuilder sb) {
|
||||
if (sb != null) {
|
||||
sb.delete(0, sb.length());
|
||||
}
|
||||
}
|
||||
|
||||
public static final class Doctype extends Token {
|
||||
final StringBuilder name = new StringBuilder();
|
||||
String pubSysKey = null;
|
||||
final StringBuilder publicIdentifier = new StringBuilder();
|
||||
final StringBuilder systemIdentifier = new StringBuilder();
|
||||
boolean forceQuirks = false;
|
||||
|
||||
Doctype() {
|
||||
super(TokenType.Doctype);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Token reset() {
|
||||
reset(name);
|
||||
pubSysKey = null;
|
||||
reset(publicIdentifier);
|
||||
reset(systemIdentifier);
|
||||
forceQuirks = false;
|
||||
return this;
|
||||
}
|
||||
|
||||
String getName() {
|
||||
return name.toString();
|
||||
}
|
||||
|
||||
String getPubSysKey() {
|
||||
return pubSysKey;
|
||||
}
|
||||
|
||||
String getPublicIdentifier() {
|
||||
return publicIdentifier.toString();
|
||||
}
|
||||
|
||||
public String getSystemIdentifier() {
|
||||
return systemIdentifier.toString();
|
||||
}
|
||||
|
||||
public boolean isForceQuirks() {
|
||||
return forceQuirks;
|
||||
}
|
||||
}
|
||||
|
||||
public static abstract class Tag extends Token {
|
||||
|
||||
public String tagName;
|
||||
public String normalName; // lc version of tag name, for case insensitive tree build
|
||||
private String pendingAttributeName; // attribute names are generally caught in one hop, not accumulated
|
||||
private StringBuilder pendingAttributeValue = new StringBuilder(); // but values are accumulated, from e.g. & in hrefs
|
||||
private String pendingAttributeValueS; // try to get attr vals in one shot, vs Builder
|
||||
private boolean hasEmptyAttributeValue = false; // distinguish boolean attribute from empty string value
|
||||
private boolean hasPendingAttributeValue = false;
|
||||
public boolean selfClosing = false;
|
||||
public Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used).
|
||||
|
||||
protected Tag(@NonNull TokenType tokenType) {
|
||||
super(tokenType);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tag reset() {
|
||||
tagName = null;
|
||||
normalName = null;
|
||||
pendingAttributeName = null;
|
||||
reset(pendingAttributeValue);
|
||||
pendingAttributeValueS = null;
|
||||
hasEmptyAttributeValue = false;
|
||||
hasPendingAttributeValue = false;
|
||||
selfClosing = false;
|
||||
attributes = null;
|
||||
return this;
|
||||
}
|
||||
|
||||
final void newAttribute() {
|
||||
if (attributes == null)
|
||||
attributes = new Attributes();
|
||||
|
||||
if (pendingAttributeName != null) {
|
||||
// the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here
|
||||
pendingAttributeName = pendingAttributeName.trim();
|
||||
if (pendingAttributeName.length() > 0) {
|
||||
String value;
|
||||
if (hasPendingAttributeValue)
|
||||
value = pendingAttributeValue.length() > 0 ? pendingAttributeValue.toString() : pendingAttributeValueS;
|
||||
else if (hasEmptyAttributeValue)
|
||||
value = "";
|
||||
else
|
||||
value = null;
|
||||
attributes.put(pendingAttributeName, value);
|
||||
}
|
||||
}
|
||||
pendingAttributeName = null;
|
||||
hasEmptyAttributeValue = false;
|
||||
hasPendingAttributeValue = false;
|
||||
reset(pendingAttributeValue);
|
||||
pendingAttributeValueS = null;
|
||||
}
|
||||
|
||||
final void finaliseTag() {
|
||||
// finalises for emit
|
||||
if (pendingAttributeName != null) {
|
||||
// todo: check if attribute name exists; if so, drop and error
|
||||
newAttribute();
|
||||
}
|
||||
}
|
||||
|
||||
final String name() { // preserves case, for input into Tag.valueOf (which may drop case)
|
||||
Validate.isFalse(tagName == null || tagName.length() == 0);
|
||||
return tagName;
|
||||
}
|
||||
|
||||
final String normalName() { // loses case, used in tree building for working out where in tree it should go
|
||||
return normalName;
|
||||
}
|
||||
|
||||
final Tag name(String name) {
|
||||
tagName = name;
|
||||
normalName = lowerCase(name);
|
||||
return this;
|
||||
}
|
||||
|
||||
final boolean isSelfClosing() {
|
||||
return selfClosing;
|
||||
}
|
||||
|
||||
@SuppressWarnings({"TypeMayBeWeakened"})
|
||||
final Attributes getAttributes() {
|
||||
return attributes;
|
||||
}
|
||||
|
||||
// these appenders are rarely hit in not null state-- caused by null chars.
|
||||
final void appendTagName(String append) {
|
||||
tagName = tagName == null ? append : tagName.concat(append);
|
||||
normalName = lowerCase(tagName);
|
||||
}
|
||||
|
||||
final void appendTagName(char append) {
|
||||
appendTagName(String.valueOf(append));
|
||||
}
|
||||
|
||||
final void appendAttributeName(String append) {
|
||||
pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append);
|
||||
}
|
||||
|
||||
final void appendAttributeName(char append) {
|
||||
appendAttributeName(String.valueOf(append));
|
||||
}
|
||||
|
||||
final void appendAttributeValue(String append) {
|
||||
ensureAttributeValue();
|
||||
if (pendingAttributeValue.length() == 0) {
|
||||
pendingAttributeValueS = append;
|
||||
} else {
|
||||
pendingAttributeValue.append(append);
|
||||
}
|
||||
}
|
||||
|
||||
final void appendAttributeValue(char append) {
|
||||
ensureAttributeValue();
|
||||
pendingAttributeValue.append(append);
|
||||
}
|
||||
|
||||
final void appendAttributeValue(char[] append) {
|
||||
ensureAttributeValue();
|
||||
pendingAttributeValue.append(append);
|
||||
}
|
||||
|
||||
final void appendAttributeValue(int[] appendCodepoints) {
|
||||
ensureAttributeValue();
|
||||
for (int codepoint : appendCodepoints) {
|
||||
pendingAttributeValue.appendCodePoint(codepoint);
|
||||
}
|
||||
}
|
||||
|
||||
final void setEmptyAttributeValue() {
|
||||
hasEmptyAttributeValue = true;
|
||||
}
|
||||
|
||||
private void ensureAttributeValue() {
|
||||
hasPendingAttributeValue = true;
|
||||
// if on second hit, we'll need to move to the builder
|
||||
if (pendingAttributeValueS != null) {
|
||||
pendingAttributeValue.append(pendingAttributeValueS);
|
||||
pendingAttributeValueS = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public final static class StartTag extends Tag {
|
||||
StartTag() {
|
||||
super(TokenType.StartTag);
|
||||
attributes = new Attributes();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tag reset() {
|
||||
super.reset();
|
||||
attributes = new Attributes();
|
||||
// todo - would prefer these to be null, but need to check Element assertions
|
||||
return this;
|
||||
}
|
||||
|
||||
StartTag nameAttr(String name, Attributes attributes) {
|
||||
this.tagName = name;
|
||||
this.attributes = attributes;
|
||||
normalName = lowerCase(tagName);
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
if (attributes != null && attributes.size() > 0)
|
||||
return "<" + name() + " " + attributes.toString() + ">";
|
||||
else
|
||||
return "<" + name() + ">";
|
||||
}
|
||||
}
|
||||
|
||||
public final static class EndTag extends Tag{
|
||||
EndTag() {
|
||||
super(TokenType.EndTag);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "</" + name() + ">";
|
||||
}
|
||||
}
|
||||
|
||||
public final static class Comment extends Token {
|
||||
final StringBuilder data = new StringBuilder();
|
||||
boolean bogus = false;
|
||||
|
||||
@Override
|
||||
public Token reset() {
|
||||
reset(data);
|
||||
bogus = false;
|
||||
return this;
|
||||
}
|
||||
|
||||
Comment() {
|
||||
super(TokenType.Comment);
|
||||
}
|
||||
|
||||
String getData() {
|
||||
return data.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "<!--" + getData() + "-->";
|
||||
}
|
||||
}
|
||||
|
||||
public static class Character extends Token {
|
||||
private String data;
|
||||
|
||||
Character() {
|
||||
super(TokenType.Character);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Token reset() {
|
||||
data = null;
|
||||
return this;
|
||||
}
|
||||
|
||||
Character data(String data) {
|
||||
this.data = data;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getData();
|
||||
}
|
||||
}
|
||||
|
||||
public final static class CData extends Character {
|
||||
CData(String data) {
|
||||
super();
|
||||
this.data(data);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "<![CDATA[" + getData() + "]]>";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public final static class EOF extends Token {
|
||||
EOF() {
|
||||
super(Token.TokenType.EOF);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Token reset() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
// final boolean isDoctype() {
|
||||
// return type == TokenType.Doctype;
|
||||
// }
|
||||
//
|
||||
// final Doctype asDoctype() {
|
||||
// return (Doctype) this;
|
||||
// }
|
||||
//
|
||||
// final boolean isStartTag() {
|
||||
// return type == TokenType.StartTag;
|
||||
// }
|
||||
//
|
||||
// final StartTag asStartTag() {
|
||||
// return (StartTag) this;
|
||||
// }
|
||||
//
|
||||
// final boolean isEndTag() {
|
||||
// return type == TokenType.EndTag;
|
||||
// }
|
||||
//
|
||||
// final EndTag asEndTag() {
|
||||
// return (EndTag) this;
|
||||
// }
|
||||
//
|
||||
// final boolean isComment() {
|
||||
// return type == TokenType.Comment;
|
||||
// }
|
||||
//
|
||||
// final Comment asComment() {
|
||||
// return (Comment) this;
|
||||
// }
|
||||
//
|
||||
// final boolean isCharacter() {
|
||||
// return type == TokenType.Character;
|
||||
// }
|
||||
//
|
||||
// final boolean isCData() {
|
||||
// return this instanceof CData;
|
||||
// }
|
||||
//
|
||||
// final Character asCharacter() {
|
||||
// return (Character) this;
|
||||
// }
|
||||
//
|
||||
// final boolean isEOF() {
|
||||
// return type == TokenType.EOF;
|
||||
// }
|
||||
|
||||
/**
 * Kinds of tokens produced by the tokeniser. There is no separate CData constant:
 * CData is treated in the builder as an extension of {@link #Character}.
 */
public enum TokenType {
    Doctype,
    StartTag,
    EndTag,
    Comment,
    Character,
    EOF
}
|
||||
}
|
@ -0,0 +1,295 @@
|
||||
package ru.noties.markwon.html.jsoup.parser;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import ru.noties.markwon.html.jsoup.helper.Validate;
|
||||
import ru.noties.markwon.html.jsoup.nodes.Entities;
|
||||
|
||||
/**
|
||||
* Reads the input stream into tokens.
|
||||
*/
|
||||
public final class Tokeniser {
|
||||
static final char replacementChar = '\uFFFD'; // replaces null character
|
||||
private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};
|
||||
|
||||
// Some illegal character escapes are parsed by browsers as windows-1252 instead. See issue #1034
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
|
||||
static final int win1252ExtensionsStart = 0x80;
|
||||
static final int[] win1252Extensions = new int[] {
|
||||
// we could build this manually, but Windows-1252 is not a standard java charset so that could break on
|
||||
// some platforms - this table is verified with a test
|
||||
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
|
||||
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
|
||||
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
|
||||
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
|
||||
};
|
||||
|
||||
static {
|
||||
Arrays.sort(notCharRefCharsSorted);
|
||||
}
|
||||
|
||||
private final CharacterReader reader; // html input
|
||||
private final ParseErrorList errors; // errors found while tokenising
|
||||
|
||||
private TokeniserState state = TokeniserState.Data; // current tokenisation state
|
||||
private Token emitPending; // the token we are about to emit on next read
|
||||
private boolean isEmitPending = false;
|
||||
private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one
|
||||
private StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
|
||||
StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>
|
||||
|
||||
Token.Tag tagPending; // tag we are building up
|
||||
Token.StartTag startPending = new Token.StartTag();
|
||||
Token.EndTag endPending = new Token.EndTag();
|
||||
Token.Character charPending = new Token.Character();
|
||||
Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
|
||||
Token.Comment commentPending = new Token.Comment(); // comment building up
|
||||
private String lastStartTag; // the last start tag emitted, to test appropriate end tag
|
||||
|
||||
/**
 * @param reader html input to tokenise
 * @param errors list that is kept for errors found while tokenising
 */
public Tokeniser(CharacterReader reader, ParseErrorList errors) {
    this.errors = errors;
    this.reader = reader;
}
|
||||
|
||||
public Token read() {
|
||||
while (!isEmitPending)
|
||||
state.read(this, reader);
|
||||
|
||||
// if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read:
|
||||
if (charsBuilder.length() > 0) {
|
||||
String str = charsBuilder.toString();
|
||||
charsBuilder.delete(0, charsBuilder.length());
|
||||
charsString = null;
|
||||
return charPending.data(str);
|
||||
} else if (charsString != null) {
|
||||
Token token = charPending.data(charsString);
|
||||
charsString = null;
|
||||
return token;
|
||||
} else {
|
||||
isEmitPending = false;
|
||||
return emitPending;
|
||||
}
|
||||
}
|
||||
|
||||
void emit(Token token) {
|
||||
Validate.isFalse(isEmitPending, "There is an unread token pending!");
|
||||
|
||||
emitPending = token;
|
||||
isEmitPending = true;
|
||||
|
||||
if (token.type == Token.TokenType.StartTag) {
|
||||
Token.StartTag startTag = (Token.StartTag) token;
|
||||
lastStartTag = startTag.tagName;
|
||||
} else if (token.type == Token.TokenType.EndTag) {
|
||||
Token.EndTag endTag = (Token.EndTag) token;
|
||||
if (endTag.attributes != null)
|
||||
error("Attributes incorrectly present on end tag");
|
||||
}
|
||||
}
|
||||
|
||||
void emit(final String str) {
|
||||
// buffer strings up until last string token found, to emit only one token for a run of character refs etc.
|
||||
// does not set isEmitPending; read checks that
|
||||
if (charsString == null) {
|
||||
charsString = str;
|
||||
}
|
||||
else {
|
||||
if (charsBuilder.length() == 0) { // switching to string builder as more than one emit before read
|
||||
charsBuilder.append(charsString);
|
||||
}
|
||||
charsBuilder.append(str);
|
||||
}
|
||||
}
|
||||
|
||||
void emit(char[] chars) {
|
||||
emit(String.valueOf(chars));
|
||||
}
|
||||
|
||||
void emit(int[] codepoints) {
|
||||
emit(new String(codepoints, 0, codepoints.length));
|
||||
}
|
||||
|
||||
void emit(char c) {
|
||||
emit(String.valueOf(c));
|
||||
}
|
||||
|
||||
TokeniserState getState() {
|
||||
return state;
|
||||
}
|
||||
|
||||
void transition(TokeniserState state) {
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
void advanceTransition(TokeniserState state) {
|
||||
reader.advance();
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
|
||||
final private int[] multipointHolder = new int[2];
|
||||
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
|
||||
if (reader.isEmpty())
|
||||
return null;
|
||||
if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
|
||||
return null;
|
||||
if (reader.matchesAnySorted(notCharRefCharsSorted))
|
||||
return null;
|
||||
|
||||
final int[] codeRef = codepointHolder;
|
||||
reader.mark();
|
||||
if (reader.matchConsume("#")) { // numbered
|
||||
boolean isHexMode = reader.matchConsumeIgnoreCase("X");
|
||||
String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
|
||||
if (numRef.length() == 0) { // didn't match anything
|
||||
characterReferenceError("numeric reference with no numerals");
|
||||
reader.rewindToMark();
|
||||
return null;
|
||||
}
|
||||
if (!reader.matchConsume(";"))
|
||||
characterReferenceError("missing semicolon"); // missing semi
|
||||
int charval = -1;
|
||||
try {
|
||||
int base = isHexMode ? 16 : 10;
|
||||
charval = Integer.valueOf(numRef, base);
|
||||
} catch (NumberFormatException ignored) {
|
||||
} // skip
|
||||
if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
|
||||
characterReferenceError("character outside of valid range");
|
||||
codeRef[0] = replacementChar;
|
||||
return codeRef;
|
||||
} else {
|
||||
// fix illegal unicode characters to match browser behavior
|
||||
if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) {
|
||||
characterReferenceError("character is not a valid unicode code point");
|
||||
charval = win1252Extensions[charval - win1252ExtensionsStart];
|
||||
}
|
||||
|
||||
// todo: implement number replacement table
|
||||
// todo: check for extra illegal unicode points as parse errors
|
||||
codeRef[0] = charval;
|
||||
return codeRef;
|
||||
}
|
||||
} else { // named
|
||||
// get as many letters as possible, and look for matching entities.
|
||||
String nameRef = reader.consumeLetterThenDigitSequence();
|
||||
boolean looksLegit = reader.matches(';');
|
||||
// found if a base named entity without a ;, or an extended entity with the ;.
|
||||
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
|
||||
|
||||
if (!found) {
|
||||
reader.rewindToMark();
|
||||
if (looksLegit) // named with semicolon
|
||||
characterReferenceError(String.format("invalid named referenece '%s'", nameRef));
|
||||
return null;
|
||||
}
|
||||
if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
|
||||
// don't want that to match
|
||||
reader.rewindToMark();
|
||||
return null;
|
||||
}
|
||||
if (!reader.matchConsume(";"))
|
||||
characterReferenceError("missing semicolon"); // missing semi
|
||||
int numChars = Entities.codepointsForName(nameRef, multipointHolder);
|
||||
if (numChars == 1) {
|
||||
codeRef[0] = multipointHolder[0];
|
||||
return codeRef;
|
||||
} else if (numChars ==2) {
|
||||
return multipointHolder;
|
||||
} else {
|
||||
Validate.fail("Unexpected characters returned for " + nameRef);
|
||||
return multipointHolder;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Token.Tag createTagPending(boolean start) {
|
||||
tagPending = start ? startPending.reset() : endPending.reset();
|
||||
return tagPending;
|
||||
}
|
||||
|
||||
void emitTagPending() {
|
||||
tagPending.finaliseTag();
|
||||
emit(tagPending);
|
||||
}
|
||||
|
||||
void createCommentPending() {
|
||||
commentPending.reset();
|
||||
}
|
||||
|
||||
void emitCommentPending() {
|
||||
emit(commentPending);
|
||||
}
|
||||
|
||||
void createDoctypePending() {
|
||||
doctypePending.reset();
|
||||
}
|
||||
|
||||
void emitDoctypePending() {
|
||||
emit(doctypePending);
|
||||
}
|
||||
|
||||
void createTempBuffer() {
|
||||
Token.reset(dataBuffer);
|
||||
}
|
||||
|
||||
boolean isAppropriateEndTagToken() {
|
||||
return lastStartTag != null && tagPending.name().equalsIgnoreCase(lastStartTag);
|
||||
}
|
||||
|
||||
String appropriateEndTagName() {
|
||||
return lastStartTag; // could be null
|
||||
}
|
||||
|
||||
void error(TokeniserState state) {
|
||||
if (errors.canAddError())
|
||||
errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
|
||||
}
|
||||
|
||||
void eofError(TokeniserState state) {
|
||||
if (errors.canAddError())
|
||||
errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
|
||||
}
|
||||
|
||||
private void characterReferenceError(String message) {
|
||||
if (errors.canAddError())
|
||||
errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
|
||||
}
|
||||
|
||||
void error(String errorMsg) {
|
||||
if (errors.canAddError())
|
||||
errors.add(new ParseError(reader.pos(), errorMsg));
|
||||
}
|
||||
|
||||
boolean currentNodeInHtmlNS() {
|
||||
// todo: implement namespaces correctly
|
||||
return true;
|
||||
// Element currentNode = currentNode();
|
||||
// return currentNode != null && currentNode.namespace().equals("HTML");
|
||||
}
|
||||
|
||||
// /**
|
||||
// * Utility method to consume reader and unescape entities found within.
|
||||
// * @param inAttribute if the text to be unescaped is in an attribute
|
||||
// * @return unescaped string from reader
|
||||
// */
|
||||
// String unescapeEntities(boolean inAttribute) {
|
||||
// StringBuilder builder = StringUtil.stringBuilder();
|
||||
// while (!reader.isEmpty()) {
|
||||
// builder.append(reader.consumeTo('&'));
|
||||
// if (reader.matches('&')) {
|
||||
// reader.consume();
|
||||
// int[] c = consumeCharacterReference(null, inAttribute);
|
||||
// if (c == null || c.length==0)
|
||||
// builder.append('&');
|
||||
// else {
|
||||
// builder.appendCodePoint(c[0]);
|
||||
// if (c.length == 2)
|
||||
// builder.appendCodePoint(c[1]);
|
||||
// }
|
||||
//
|
||||
// }
|
||||
// }
|
||||
// return builder.toString();
|
||||
// }
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -1 +1,2 @@
|
||||
include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension', ':library-syntax'
|
||||
include ':app', ':library', ':library-image-loader', ':library-view', ':sample-custom-extension',
|
||||
':library-syntax', ':html-parser-api', ':html-parser-impl'
|
||||
|
Loading…
x
Reference in New Issue
Block a user