Add HtmlEmptyTagReplacement abstraction for empty tags

This commit is contained in:
Dimitry Ivanov 2018-08-17 15:44:33 +03:00
parent 97a25ecc14
commit bf8ff03b1c
7 changed files with 543 additions and 40 deletions

View File

@ -52,4 +52,7 @@ ext {
PRISM_4J = 'ru.noties:prism4j:1.1.0'
PRISM_4J_BUNDLER = 'ru.noties:prism4j-bundler:1.1.0'
JUNIT = 'junit:junit:4.12'
ROBOLECTRIC = 'org.robolectric:robolectric:3.8'
}

View File

@ -27,6 +27,11 @@ public interface HtmlTag {
*/
int end();
/**
* @return flag indicating if this tag has no content (when start == end)
*/
boolean isEmpty();
/**
* Represents <em>really</em> inline HTML tags (unlike commonmark definitions)
*/

View File

@ -14,8 +14,12 @@ android {
}
dependencies {
api SUPPORT_ANNOTATIONS
api project(':html-parser-api')
testImplementation JUNIT
testImplementation ROBOLECTRIC
}
afterEvaluate {

View File

@ -0,0 +1,52 @@
package ru.noties.markwon.html;
import android.support.annotation.NonNull;
import android.support.annotation.Nullable;
import ru.noties.markwon.html.jsoup.parser.Token;
/**
 * Provides the text that is appended to the output in place of a tag that has no content,
 * so that there is something a Span can be applied to. Please note that this class is
 * consulted for _void_ tags and for tags that are self-closed (even if the HTML spec doesn't
 * define such a tag as self-closed). This is due to the fact that the underlying parser does
 * not validate context and does not check if a tag is correctly used.
 */
public class HtmlEmptyTagReplacement {

    // Unicode "object replacement character" (U+FFFC), used for an image without alt text
    private static final String IMG_REPLACEMENT = "\uFFFC";

    /**
     * @return a new instance with the default replacement rules
     */
    @NonNull
    public static HtmlEmptyTagReplacement create() {
        return new HtmlEmptyTagReplacement();
    }

    /**
     * Default rules: {@code br} is replaced with a new line, {@code img} is replaced with
     * its non-empty {@code alt} attribute (or with the object replacement character when
     * there is none), every other tag has no replacement.
     *
     * @param startTag the start tag that has no content
     * @return replacement for the supplied startTag, or null if no replacement should occur
     * (in which case the tag will have the same start and end value and thus cannot have
     * a Span applied)
     */
    @Nullable
    public String replace(@NonNull Token.StartTag startTag) {

        final String tagName = startTag.normalName;

        if ("br".equals(tagName)) {
            return "\n";
        }

        if (!"img".equals(tagName)) {
            // nothing to put in place of any other tag
            return null;
        }

        final String alt = startTag.attributes.getIgnoreCase("alt");
        final boolean hasAlt = alt != null && alt.length() > 0;
        return hasAlt ? alt : IMG_REPLACEMENT;
    }
}

View File

@ -8,7 +8,7 @@ import java.util.List;
abstract class HtmlTagImpl implements HtmlTag {
static final int NO_VALUE = -1;
private static final int NO_VALUE = -1;
final String name;
final int start;
@ -35,13 +35,17 @@ abstract class HtmlTagImpl implements HtmlTag {
return end;
}
@Override
public boolean isEmpty() {
return start == end;
}
boolean isClosed() {
return end > NO_VALUE;
}
abstract void closeAt(int end);
static class InlineImpl extends HtmlTagImpl implements Inline {
InlineImpl(@NonNull String name, int start) {
@ -54,6 +58,15 @@ abstract class HtmlTagImpl implements HtmlTag {
super.end = end;
}
}
@Override
public String toString() {
    // renders as: InlineImpl{name='<name>', start=<start>, end=<end>}
    final StringBuilder text = new StringBuilder("InlineImpl{");
    text.append("name='").append(name).append('\'');
    text.append(", start=").append(start);
    text.append(", end=").append(end);
    return text.append('}').toString();
}
}
static class BlockImpl extends HtmlTagImpl implements Block {
@ -83,7 +96,7 @@ abstract class HtmlTagImpl implements HtmlTag {
if (!isClosed()) {
super.end = end;
if (children != null) {
for (BlockImpl child: children) {
for (BlockImpl child : children) {
child.closeAt(end);
}
children = Collections.unmodifiableList(children);
@ -113,5 +126,16 @@ abstract class HtmlTagImpl implements HtmlTag {
//noinspection unchecked
return (List<Block>) (List<? extends Block>) children;
}
@Override
public String toString() {
    // only the parent's name is printed (not the parent itself), children are printed in full
    final String parentName = parent != null ? parent.name : null;
    final StringBuilder text = new StringBuilder("BlockImpl{");
    text.append("name='").append(name).append('\'');
    text.append(", start=").append(start);
    text.append(", end=").append(end);
    text.append(", parent=").append(parentName);
    text.append(", children=").append(children);
    return text.append('}').toString();
}
}
}

View File

@ -24,7 +24,12 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
@NonNull
public static MarkwonHtmlParserImpl create() {
return new MarkwonHtmlParserImpl();
return create(HtmlEmptyTagReplacement.create());
}
@NonNull
public static MarkwonHtmlParserImpl create(@NonNull HtmlEmptyTagReplacement inlineTagReplacement) {
return new MarkwonHtmlParserImpl(inlineTagReplacement);
}
// https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
@ -43,7 +48,7 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
private static final String TAG_LIST_ITEM = "li";
// todo: make it configurable
private static final String IMG_REPLACEMENT = "\uFFFC";
// private static final String IMG_REPLACEMENT = "\uFFFC";
static {
INLINE_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
@ -96,10 +101,16 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
)));
}
private final HtmlEmptyTagReplacement emptyTagReplacement;
private final List<InlineImpl> inlineTags = new ArrayList<>(0);
private BlockImpl currentBlock = BlockImpl.root();
MarkwonHtmlParserImpl(@NonNull HtmlEmptyTagReplacement replacement) {
this.emptyTagReplacement = replacement;
}
@Override
public <T extends Appendable & CharSequence> void processFragment(
@ -203,17 +214,19 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
if (isVoidTag(name)
|| startTag.selfClosing) {
// check if we have content to append as we must close this tag here
processVoidTag(output, startTag);
final String replacement = emptyTagReplacement.replace(startTag);
if (replacement != null
&& replacement.length() > 0) {
append(output, replacement);
}
// the thing is: we will keep this inline tag in the list,
// but in case of void-tag that has no replacement, there will be no
// possibility to set a span (requires at least one char)
inline.closeAt(output.length());
}
// actually only check if there is content for void/self-closing tags
// if none -> ignore it
if (inline.start != inline.end) {
inlineTags.add(inline);
}
inlineTags.add(inline);
}
protected <T extends Appendable & CharSequence> void processInlineTagEnd(
@ -236,16 +249,14 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
final String name = startTag.normalName;
// block tags (all that are NOT inline -> blocks)
// I think there is only one strong rule -> paragraph cannot contain anything
// there is only one strong rule -> paragraph cannot contain anything
// except inline tags
// also, closing paragraph with non-closed inlines -> doesn't close inlines
// they are continued for _afterwards_
if (TAG_PARAGRAPH.equals(currentBlock.name)) {
// it must be closed here no matter what we are as here we _assume_
// that it's a block tag
append(output, "\n");
currentBlock.closeAt(output.length());
append(output, "\n");
currentBlock = currentBlock.parent;
} else if (TAG_LIST_ITEM.equals(name)
&& TAG_LIST_ITEM.equals(currentBlock.name)) {
@ -262,10 +273,23 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
final BlockImpl block = BlockImpl.create(name, start, currentBlock);
final boolean isVoid = isVoidTag(name) || startTag.selfClosing;
if (isVoid) {
final String replacement = emptyTagReplacement.replace(startTag);
if (replacement != null
&& replacement.length() > 0) {
append(output, replacement);
}
block.closeAt(output.length());
}
//noinspection ConstantConditions
appendBlockChild(block.parent, block);
this.currentBlock = block;
// if not void start filling-in children
if (!isVoid) {
this.currentBlock = block;
}
}
protected <T extends Appendable & CharSequence> void processBlockTagEnd(
@ -277,37 +301,16 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
final BlockImpl block = findOpenBlockTag(endTag.normalName);
if (block != null) {
block.closeAt(output.length());
if (TAG_PARAGRAPH.equals(name)) {
append(output, "\n");
}
block.closeAt(output.length());
this.currentBlock = block.parent;
}
}
protected <T extends Appendable & CharSequence> void processVoidTag(
@NonNull T output,
@NonNull Token.StartTag startTag) {
final String name = startTag.normalName;
if ("br".equals(name)) {
append(output, "\n");
} else if ("img".equals(name)) {
final String alt = startTag.attributes.getIgnoreCase("alt");
if (alt == null
|| alt.length() == 0) {
// no alt is provided
append(output, IMG_REPLACEMENT);
} else {
append(output, alt);
}
}
// other tags are ignored
}
protected <T extends Appendable & CharSequence> void processCharacter(
@NonNull T output,
@NonNull Token.Character character) {

View File

@ -0,0 +1,412 @@
package ru.noties.markwon.html;
import android.support.annotation.NonNull;
import android.support.annotation.Nullable;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.robolectric.RobolectricTestRunner;
import org.robolectric.annotation.Config;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import ru.noties.markwon.html.jsoup.parser.Token;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@RunWith(RobolectricTestRunner.class)
@Config(manifest = Config.NONE)
public class MarkwonHtmlParserImplTest {

    @Test
    public void inlineTags() {
        // every tag from the list must be reported as an inline tag and must span
        // exactly its own content (which, for this test, is the tag name itself)

        final MarkwonHtmlParserImpl impl = parserReplacingWithTagName();

        final List<String> tags = Arrays.asList(
                "a", "abbr", "acronym",
                "b", "bdo", "big", "br", "button",
                "cite", "code",
                "dfn",
                "em",
                "i", "img", "input",
                "kbd",
                "label",
                "map",
                "object",
                "q",
                "samp", "script", "select", "small", "span", "strong", "sub", "sup",
                "textarea", "time", "tt",
                "var"
        );

        final StringBuilder output = new StringBuilder();
        impl.processFragment(output, tagsWrappingOwnName(tags));

        final CaptureInlineTagsAction action = new CaptureInlineTagsAction();
        impl.flushInlineTags(output.length(), action);
        assertTrue(action.called);

        final List<HtmlTag.Inline> inlines = action.tags;

        if (tags.size() != inlines.size()) {
            // report which of the expected tags were not captured
            final Set<String> missing = new HashSet<>(tags);
            for (HtmlTag.Inline inline : inlines) {
                missing.remove(inline.name());
            }
            throw new AssertionError("Missing inline tags: " + missing);
        }

        final Set<String> remaining = new HashSet<>(tags);
        for (HtmlTag.Inline inline : inlines) {
            assertTrue(remaining.remove(inline.name()));
            assertEquals(inline.name(), output.substring(inline.start(), inline.end()));
        }
        assertEquals(0, remaining.size());
    }

    @Test
    public void inlineVoidTags() {
        // inline void tags without a replacement produce no output,
        // but are still reported (as empty tags)

        final List<String> tags = Arrays.asList(
                "br",
                "img", "input"
        );

        final MarkwonHtmlParserImpl impl = parserWithoutReplacement();

        final StringBuilder output = new StringBuilder();
        impl.processFragment(output, openTags(tags));

        assertEquals(0, output.length());

        final CaptureInlineTagsAction action = new CaptureInlineTagsAction();
        impl.flushInlineTags(output.length(), action);
        assertTrue(action.called);

        final List<HtmlTag.Inline> inlines = action.tags;
        assertEquals(inlines.toString(), tags.size(), inlines.size());

        final Set<String> remaining = new HashSet<>(tags);
        for (HtmlTag.Inline inline : inlines) {
            assertEquals(inline.name(), inline.start(), inline.end());
            assertTrue(inline.name(), inline.isEmpty());
            assertTrue(remaining.remove(inline.name()));
        }
        assertEquals(remaining.toString(), 0, remaining.size());
    }

    @Test
    public void blockVoidTags() {
        // block void tags without a replacement produce no output,
        // but are still reported (as empty tags)

        final MarkwonHtmlParserImpl impl = parserWithoutReplacement();

        final List<String> tags = Arrays.asList(
                "area",
                "base",
                "col",
                "embed",
                "hr",
                "keygen",
                "link",
                "meta",
                "param",
                "source",
                "track",
                "wbr"
        );

        final StringBuilder output = new StringBuilder();
        impl.processFragment(output, openTags(tags));

        assertEquals(0, output.length());

        final CaptureBlockTagsAction action = new CaptureBlockTagsAction();
        impl.flushBlockTags(output.length(), action);
        assertTrue(action.called);

        final List<HtmlTag.Block> blocks = action.tags;
        assertEquals(blocks.toString(), tags.size(), blocks.size());

        final Set<String> remaining = new HashSet<>(tags);
        for (HtmlTag.Block block : blocks) {
            assertEquals(block.name(), block.start(), block.end());
            assertTrue(block.name(), block.isEmpty());
            assertTrue(remaining.remove(block.name()));
        }
        assertEquals(remaining.toString(), 0, remaining.size());
    }

    @Test
    public void selfClosingTags() {
        // self-closing tags (grammatically) must be treated as empty ones
        // (no checks for real html)

        final List<String> tags = Arrays.asList(
                "one",
                "two-two",
                "three-three-three",
                "four-four-four-four",
                "FiveFiveFiveFiveFive"
        );

        final MarkwonHtmlParserImpl impl = parserWithoutReplacement();

        final StringBuilder output = new StringBuilder();
        impl.processFragment(output, selfClosedTags(tags));

        assertEquals(output.toString(), 0, output.length());

        final CaptureBlockTagsAction action = new CaptureBlockTagsAction();
        impl.flushBlockTags(output.length(), action);
        assertTrue(action.called);

        final List<HtmlTag.Block> blocks = action.tags;
        assertEquals(blocks.toString(), tags.size(), blocks.size());

        // tag names must be lower cased
        final Set<String> remaining = new HashSet<>(tags.size());
        for (String tag : tags) {
            remaining.add(tag.toLowerCase());
        }

        for (HtmlTag.Block block : blocks) {
            assertTrue(block.name(), block.isEmpty());
            assertTrue(remaining.remove(block.name()));
        }
        assertEquals(remaining.toString(), 0, remaining.size());
    }

    @Test
    public void blockTags() {
        // the tags that will require a new line before them

        final List<String> tags = Arrays.asList(
                "address", "article", "aside",
                "blockquote",
                "canvas",
                "dd", "div", "dl", "dt",
                "fieldset", "figcaption", "figure", "footer", "form",
                "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
                "li",
                "main",
                "nav", "noscript",
                "ol", "output",
                "p", "pre",
                "section",
                "table", "tfoot",
                "ul",
                "video"
        );

        final MarkwonHtmlParserImpl impl = parserReplacingWithTagName();

        final StringBuilder output = new StringBuilder();
        impl.processFragment(output, tagsWrappingOwnName(tags));

        final CaptureBlockTagsAction action = new CaptureBlockTagsAction();
        impl.flushBlockTags(output.length(), action);
        assertTrue(action.called);

        final List<HtmlTag.Block> blocks = action.tags;
        assertEquals(blocks.toString(), tags.size(), blocks.size());

        final Set<String> remaining = new HashSet<>(tags);

        // the very first block has nothing before it, all others must follow a new line
        boolean first = true;
        for (HtmlTag.Block block : blocks) {
            assertEquals(block.name(), block.name(), output.substring(block.start(), block.end()));
            if (first) {
                first = false;
            } else {
                assertEquals('\n', output.charAt(block.start() - 1));
            }
            assertTrue(remaining.remove(block.name()));
        }
        assertEquals(remaining.toString(), 0, remaining.size());
    }

    // The tests below have no body yet: each one throws, so it fails when executed.

    @Test
    public void multipleFragmentsContinuation() {
        throw new RuntimeException();
    }

    @Test
    public void paragraphCannotContainAnythingButInlines() {
        throw new RuntimeException();
    }

    // todo: move to a dedicated test class for the empty tag replacement
    @Test
    public void imageReplacementNoAlt() {
        throw new RuntimeException();
    }

    @Test
    public void brAddsNewLine() {
        throw new RuntimeException();
    }

    @Test
    public void imageReplacementAlt() {
        throw new RuntimeException();
    }

    @Test
    public void blockCloseClosesChildren() {
        throw new RuntimeException();
    }

    @Test
    public void allReturnedTagsAreClosed() {
        throw new RuntimeException();
    }

    @Test
    public void allTagsAreLowerCase() {
        throw new RuntimeException();
    }

    @Test
    public void previousListItemClosed() {
        throw new RuntimeException();
    }

    @Test
    public void nestedBlocks() {
        throw new RuntimeException();
    }

    @Test
    public void attributes() {
        throw new RuntimeException();
    }

    /**
     * @return a parser whose replacement never provides any text (always {@code null})
     */
    @NonNull
    private static MarkwonHtmlParserImpl parserWithoutReplacement() {
        return new MarkwonHtmlParserImpl(new HtmlEmptyTagReplacement() {
            @Nullable
            @Override
            public String replace(@NonNull Token.StartTag startTag) {
                return null;
            }
        });
    }

    /**
     * @return a parser whose replacement is the (normalized) name of the tag being replaced
     */
    @NonNull
    private static MarkwonHtmlParserImpl parserReplacingWithTagName() {
        return new MarkwonHtmlParserImpl(new HtmlEmptyTagReplacement() {
            @Nullable
            @Override
            public String replace(@NonNull Token.StartTag startTag) {
                return startTag.normalName;
            }
        });
    }

    /**
     * @return html in which each tag is only opened: {@code <tag>}
     */
    @NonNull
    private static String openTags(@NonNull List<String> tags) {
        final StringBuilder html = new StringBuilder();
        for (String tag : tags) {
            html.append('<').append(tag).append('>');
        }
        return html.toString();
    }

    /**
     * @return html in which each tag is self-closed: {@code <tag />}
     */
    @NonNull
    private static String selfClosedTags(@NonNull List<String> tags) {
        final StringBuilder html = new StringBuilder();
        for (String tag : tags) {
            html.append('<').append(tag).append(" />");
        }
        return html.toString();
    }

    /**
     * @return html in which each tag wraps its own name: {@code <tag>tag</tag>}
     */
    @NonNull
    private static String tagsWrappingOwnName(@NonNull List<String> tags) {
        final StringBuilder html = new StringBuilder();
        for (String tag : tags) {
            html.append('<').append(tag).append('>')
                    .append(tag)
                    .append("</").append(tag).append('>');
        }
        return html.toString();
    }

    /**
     * Flush action that remembers that it was called and keeps a copy of the received tags.
     */
    private static class CaptureTagsAction<T> implements MarkwonHtmlParser.FlushAction<T> {

        boolean called;
        List<T> tags;

        @Override
        public void apply(@NonNull List<T> tags) {
            this.called = true;
            // copy, so later changes to the supplied list do not affect the captured one
            this.tags = new ArrayList<>(tags);
        }
    }

    private static class CaptureInlineTagsAction extends CaptureTagsAction<HtmlTag.Inline> {
    }

    private static class CaptureBlockTagsAction extends CaptureTagsAction<HtmlTag.Block> {
    }
}