diff --git a/build.gradle b/build.gradle index c0b701ed..31ab153d 100644 --- a/build.gradle +++ b/build.gradle @@ -52,4 +52,7 @@ ext { PRISM_4J = 'ru.noties:prism4j:1.1.0' PRISM_4J_BUNDLER = 'ru.noties:prism4j-bundler:1.1.0' + + JUNIT = 'junit:junit:4.12' + ROBOLECTRIC = 'org.robolectric:robolectric:3.8' } diff --git a/html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java b/html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java index 896f0d40..153b4a77 100644 --- a/html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java +++ b/html-parser-api/src/main/java/ru/noties/markwon/html/HtmlTag.java @@ -27,6 +27,11 @@ public interface HtmlTag { */ int end(); + /** + * @return flag indicating if this tag has no content (when start == end) + */ + boolean isEmpty(); + /** * Represents really inline HTML tags (unline commonmark definitions) */ diff --git a/html-parser-impl/build.gradle b/html-parser-impl/build.gradle index f1c8860f..f8a446d5 100644 --- a/html-parser-impl/build.gradle +++ b/html-parser-impl/build.gradle @@ -14,8 +14,12 @@ android { } dependencies { + api SUPPORT_ANNOTATIONS api project(':html-parser-api') + + testImplementation JUNIT + testImplementation ROBOLECTRIC } afterEvaluate { diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlEmptyTagReplacement.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlEmptyTagReplacement.java new file mode 100644 index 00000000..75f1184b --- /dev/null +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlEmptyTagReplacement.java @@ -0,0 +1,52 @@ +package ru.noties.markwon.html; + +import android.support.annotation.NonNull; +import android.support.annotation.Nullable; + +import ru.noties.markwon.html.jsoup.parser.Token; + +/** + * This class will be used to append some text to output in order to + * apply a Span for this tag. 
Please note that this class will be used for + * _void_ tags and tags that are self-closed (even if HTML spec doesn't specify + * a tag as self-closed). This is due to the fact that the underlying parser does not + * validate context and does not check if a tag is correctly used. + */ +public class HtmlEmptyTagReplacement { + + @NonNull + public static HtmlEmptyTagReplacement create() { + return new HtmlEmptyTagReplacement(); + } + + private static final String IMG_REPLACEMENT = "\uFFFC"; + + /** + * @return replacement for supplied startTag or null if no replacement should occur (which will + * lead to the `Inline` tag having equal start & end values, so no Span can be applied to it) + */ + @Nullable + public String replace(@NonNull Token.StartTag startTag) { + + final String replacement; + + final String name = startTag.normalName; + if ("br".equals(name)) { + replacement = "\n"; + } else if ("img".equals(name)) { + final String alt = startTag.attributes.getIgnoreCase("alt"); + if (alt == null + || alt.length() == 0) { + // no alt is provided + replacement = IMG_REPLACEMENT; + } else { + replacement = alt; + } + } else { + replacement = null; + } + + return replacement; + } + +} diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java index 3f8083dd..4f6ca844 100644 --- a/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/HtmlTagImpl.java @@ -8,7 +8,7 @@ import java.util.List; abstract class HtmlTagImpl implements HtmlTag { - static final int NO_VALUE = -1; + private static final int NO_VALUE = -1; final String name; final int start; @@ -35,13 +35,17 @@ abstract class HtmlTagImpl implements HtmlTag { return end; } + @Override + public boolean isEmpty() { + return start == end; + } + boolean isClosed() { return end > NO_VALUE; } abstract void closeAt(int end); - static class InlineImpl
extends HtmlTagImpl implements Inline { InlineImpl(@NonNull String name, int start) { @@ -54,6 +58,15 @@ abstract class HtmlTagImpl implements HtmlTag { super.end = end; } } + + @Override + public String toString() { + return "InlineImpl{" + + "name='" + name + '\'' + + ", start=" + start + + ", end=" + end + + '}'; + } } static class BlockImpl extends HtmlTagImpl implements Block { @@ -83,7 +96,7 @@ abstract class HtmlTagImpl implements HtmlTag { if (!isClosed()) { super.end = end; if (children != null) { - for (BlockImpl child: children) { + for (BlockImpl child : children) { child.closeAt(end); } children = Collections.unmodifiableList(children); @@ -113,5 +126,16 @@ abstract class HtmlTagImpl implements HtmlTag { //noinspection unchecked return (List) (List) children; } + + @Override + public String toString() { + return "BlockImpl{" + + "name='" + name + '\'' + + ", start=" + start + + ", end=" + end + + ", parent=" + (parent != null ? parent.name : null) + + ", children=" + children + + '}'; + } } } diff --git a/html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java b/html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java index 4fcd4a5d..b6b731a8 100644 --- a/html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java +++ b/html-parser-impl/src/main/java/ru/noties/markwon/html/MarkwonHtmlParserImpl.java @@ -24,7 +24,12 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { @NonNull public static MarkwonHtmlParserImpl create() { - return new MarkwonHtmlParserImpl(); + return create(HtmlEmptyTagReplacement.create()); + } + + @NonNull + public static MarkwonHtmlParserImpl create(@NonNull HtmlEmptyTagReplacement inlineTagReplacement) { + return new MarkwonHtmlParserImpl(inlineTagReplacement); } // https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements @@ -43,7 +48,7 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { private static final String 
TAG_LIST_ITEM = "li"; // todo: make it configurable - private static final String IMG_REPLACEMENT = "\uFFFC"; +// private static final String IMG_REPLACEMENT = "\uFFFC"; static { INLINE_TAGS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( @@ -96,10 +101,16 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { ))); } + private final HtmlEmptyTagReplacement emptyTagReplacement; + private final List inlineTags = new ArrayList<>(0); private BlockImpl currentBlock = BlockImpl.root(); + MarkwonHtmlParserImpl(@NonNull HtmlEmptyTagReplacement replacement) { + this.emptyTagReplacement = replacement; + } + @Override public void processFragment( @@ -203,17 +214,19 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { if (isVoidTag(name) || startTag.selfClosing) { - // check if we have content to append as we must close this tag here - processVoidTag(output, startTag); + final String replacement = emptyTagReplacement.replace(startTag); + if (replacement != null + && replacement.length() > 0) { + append(output, replacement); + } + // the thing is: we will keep this inline tag in the list, + // but in case of void-tag that has no replacement, there will be no + // possibility to set a span (requires at least one char) inline.closeAt(output.length()); } - // actually only check if there is content for void/self-closing tags - // if none -> ignore it - if (inline.start != inline.end) { - inlineTags.add(inline); - } + inlineTags.add(inline); } protected void processInlineTagEnd( @@ -236,16 +249,14 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { final String name = startTag.normalName; // block tags (all that are NOT inline -> blocks - // I think there is only one strong rule -> paragraph cannot contain anything + // there is only one strong rule -> paragraph cannot contain anything // except inline tags - // also, closing paragraph with non-closed inlines -> doesn't close inlines - // they are continued for _afterwards_ if 
(TAG_PARAGRAPH.equals(currentBlock.name)) { // it must be closed here not matter what we are as here we _assume_ // that it's a block tag - append(output, "\n"); currentBlock.closeAt(output.length()); + append(output, "\n"); currentBlock = currentBlock.parent; } else if (TAG_LIST_ITEM.equals(name) && TAG_LIST_ITEM.equals(currentBlock.name)) { @@ -262,10 +273,23 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { final BlockImpl block = BlockImpl.create(name, start, currentBlock); + final boolean isVoid = isVoidTag(name) || startTag.selfClosing; + if (isVoid) { + final String replacement = emptyTagReplacement.replace(startTag); + if (replacement != null + && replacement.length() > 0) { + append(output, replacement); + } + block.closeAt(output.length()); + } + //noinspection ConstantConditions appendBlockChild(block.parent, block); - this.currentBlock = block; + // if not void start filling-in children + if (!isVoid) { + this.currentBlock = block; + } } protected void processBlockTagEnd( @@ -277,37 +301,16 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { final BlockImpl block = findOpenBlockTag(endTag.normalName); if (block != null) { + block.closeAt(output.length()); + if (TAG_PARAGRAPH.equals(name)) { append(output, "\n"); } - block.closeAt(output.length()); this.currentBlock = block.parent; } } - protected void processVoidTag( - @NonNull T output, - @NonNull Token.StartTag startTag) { - - final String name = startTag.normalName; - - if ("br".equals(name)) { - append(output, "\n"); - } else if ("img".equals(name)) { - final String alt = startTag.attributes.getIgnoreCase("alt"); - if (alt == null - || alt.length() == 0) { - // no alt is provided - append(output, IMG_REPLACEMENT); - } else { - append(output, alt); - } - } - - // other tags are ignored - } - protected void processCharacter( @NonNull T output, @NonNull Token.Character character) { diff --git 
a/html-parser-impl/src/test/java/ru/noties/markwon/html/MarkwonHtmlParserImplTest.java b/html-parser-impl/src/test/java/ru/noties/markwon/html/MarkwonHtmlParserImplTest.java new file mode 100644 index 00000000..f61d697b --- /dev/null +++ b/html-parser-impl/src/test/java/ru/noties/markwon/html/MarkwonHtmlParserImplTest.java @@ -0,0 +1,412 @@ +package ru.noties.markwon.html; + +import android.support.annotation.NonNull; +import android.support.annotation.Nullable; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.robolectric.RobolectricTestRunner; +import org.robolectric.annotation.Config; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import ru.noties.markwon.html.jsoup.parser.Token; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +@RunWith(RobolectricTestRunner.class) +@Config(manifest = Config.NONE) +public class MarkwonHtmlParserImplTest { + + @Test + public void inlineTags() { + + // all inline tags are correctly parsed + + // a simple replacement that will return tag name as replacement (for this test purposes) + final MarkwonHtmlParserImpl impl = new MarkwonHtmlParserImpl(new HtmlEmptyTagReplacement() { + @Nullable + @Override + public String replace(@NonNull Token.StartTag startTag) { + return startTag.normalName; + } + }); + + // all inline tags are parsed as ones + final List<String> tags = Arrays.asList( + "a", "abbr", "acronym", + "b", "bdo", "big", "br", "button", + "cite", "code", + "dfn", + "em", + "i", "img", "input", + "kbd", + "label", + "map", + "object", + "q", + "samp", "script", "select", "small", "span", "strong", "sub", "sup", + "textarea", "time", "tt", + "var" + ); + + final StringBuilder html = new StringBuilder(); + for (String tag : tags) { + html.append('<') + .append(tag) + .append('>') + .append(tag) + .append("</") + .append(tag) + .append('>'); + } + + final StringBuilder output = new StringBuilder(); + +
impl.processFragment(output, html.toString()); + + final CaptureInlineTagsAction action = new CaptureInlineTagsAction(); + + impl.flushInlineTags(output.length(), action); + + assertTrue(action.called); + + final List<HtmlTag.Inline> inlines = action.tags; + + if (tags.size() != inlines.size()) { + final Set<String> missing = new HashSet<>(tags); + for (HtmlTag.Inline inline : inlines) { + missing.remove(inline.name()); + } + assertTrue("Missing inline tags: " + missing, false); + } + + final Set<String> set = new HashSet<>(tags); + + for (HtmlTag.Inline inline : inlines) { + assertTrue(set.remove(inline.name())); + assertEquals(inline.name(), output.substring(inline.start(), inline.end())); + } + + assertEquals(0, set.size()); + } + + @Test + public void inlineVoidTags() { + + // all inline void tags are correctly parsed + + final List<String> tags = Arrays.asList( + "br", + "img", "input" + ); + + final MarkwonHtmlParserImpl impl = new MarkwonHtmlParserImpl(new HtmlEmptyTagReplacement() { + @Nullable + @Override + public String replace(@NonNull Token.StartTag startTag) { + return null; + } + }); + + final StringBuilder html = new StringBuilder(); + for (String tag : tags) { + html.append('<') + .append(tag) + .append('>'); + } + + final StringBuilder output = new StringBuilder(); + + impl.processFragment(output, html.toString()); + + assertEquals(0, output.length()); + + final CaptureInlineTagsAction action = new CaptureInlineTagsAction(); + + impl.flushInlineTags(output.length(), action); + + assertTrue(action.called); + + final List<HtmlTag.Inline> inlines = action.tags; + + assertEquals(inlines.toString(), tags.size(), inlines.size()); + + final Set<String> set = new HashSet<>(tags); + + for (HtmlTag.Inline inline : inlines) { + assertEquals(inline.name(), inline.start(), inline.end()); + assertTrue(inline.name(), inline.isEmpty()); + assertTrue(set.remove(inline.name())); + } + + assertEquals(set.toString(), 0, set.size()); + } + + @Test + public void blockVoidTags() { + + final MarkwonHtmlParserImpl impl = new
MarkwonHtmlParserImpl(new HtmlEmptyTagReplacement() { + @Nullable + @Override + public String replace(@NonNull Token.StartTag startTag) { + return null; + } + }); + + final List<String> tags = Arrays.asList( + "area", + "base", + "col", + "embed", + "hr", + "keygen", + "link", + "meta", + "param", + "source", + "track", + "wbr" + ); + + final StringBuilder html = new StringBuilder(); + for (String tag : tags) { + html.append('<') + .append(tag) + .append('>'); + } + + final StringBuilder output = new StringBuilder(); + + impl.processFragment(output, html.toString()); + + assertEquals(0, output.length()); + + final CaptureBlockTagsAction action = new CaptureBlockTagsAction(); + impl.flushBlockTags(output.length(), action); + + assertTrue(action.called); + + final List<HtmlTag.Block> blocks = action.tags; + + assertEquals(blocks.toString(), tags.size(), blocks.size()); + + final Set<String> set = new HashSet<>(tags); + + for (HtmlTag.Block block : blocks) { + assertEquals(block.name(), block.start(), block.end()); + assertTrue(block.name(), block.isEmpty()); + assertTrue(set.remove(block.name())); + } + + assertEquals(set.toString(), 0, set.size()); + } + + @Test + public void selfClosingTags() { + + // self-closing tags (grammatically) must be replaced (no checks for real html) + + final List<String> tags = Arrays.asList( + "one", + "two-two", + "three-three-three", + "four-four-four-four", + "FiveFiveFiveFiveFive" + ); + + final MarkwonHtmlParserImpl impl = new MarkwonHtmlParserImpl(new HtmlEmptyTagReplacement() { + @Nullable + @Override + public String replace(@NonNull Token.StartTag startTag) { + return null; + } + }); + + final StringBuilder html = new StringBuilder(); + for (String tag : tags) { + html.append('<') + .append(tag) + .append(" />"); + } + + final StringBuilder output = new StringBuilder(); + + impl.processFragment(output, html.toString()); + + assertEquals(output.toString(), 0, output.length()); + + final CaptureBlockTagsAction action = new CaptureBlockTagsAction(); + +
impl.flushBlockTags(output.length(), action); + + assertTrue(action.called); + + final List<HtmlTag.Block> blocks = action.tags; + + assertEquals(blocks.toString(), tags.size(), blocks.size()); + + // tag names must be lower cased + final Set<String> set = new HashSet<>(tags.size()); + for (String tag: tags) { + set.add(tag.toLowerCase()); + } + + for (HtmlTag.Block block: blocks) { + assertTrue(block.name(), block.isEmpty()); + assertTrue(set.remove(block.name())); + } + + assertEquals(set.toString(), 0, set.size()); + } + + @Test + public void blockTags() { + + // the tags that will require a new line before them + + final List<String> tags = Arrays.asList( + "address", "article", "aside", + "blockquote", + "canvas", + "dd", "div", "dl", "dt", + "fieldset", "figcaption", "figure", "footer", "form", + "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", + "li", + "main", + "nav", "noscript", + "ol", "output", + "p", "pre", + "section", + "table", "tfoot", + "ul", + "video" + ); + + final MarkwonHtmlParserImpl impl = new MarkwonHtmlParserImpl(new HtmlEmptyTagReplacement() { + @Nullable + @Override + public String replace(@NonNull Token.StartTag startTag) { + return startTag.normalName; + } + }); + + final StringBuilder html = new StringBuilder(); + for (String tag: tags) { + html.append('<') + .append(tag) + .append('>') + .append(tag) + .append("</") + .append(tag) + .append('>'); + } + + final StringBuilder output = new StringBuilder(); + + impl.processFragment(output, html.toString()); + + final CaptureBlockTagsAction action = new CaptureBlockTagsAction(); + + impl.flushBlockTags(output.length(), action); + + assertTrue(action.called); + + final List<HtmlTag.Block> blocks = action.tags; + assertEquals(blocks.toString(), tags.size(), blocks.size()); + + final Set<String> set = new HashSet<>(tags); + + boolean first = true; + for (HtmlTag.Block block: blocks) { + assertEquals(block.name(), block.name(), output.substring(block.start(), block.end())); + if (first) { + first = false; + } else { + assertEquals('\n', output.charAt(block.start()
- 1)); + } + assertTrue(set.remove(block.name())); + } + + assertEquals(set.toString(), 0, set.size()); + } + + @Test + public void multipleFragmentsContinuation() { + throw new RuntimeException(); + } + + @Test + public void paragraphCannotContainAnythingButInlines() { + throw new RuntimeException(); + } + + // move to HtmlEmptyTagReplacement test class + @Test + public void imageReplacementNoAlt() { + throw new RuntimeException(); + } + + @Test + public void brAddsNewLine() { + throw new RuntimeException(); + } + + @Test + public void imageReplacementAlt() { + throw new RuntimeException(); + } + + @Test + public void blockCloseClosesChildren() { + throw new RuntimeException(); + } + + @Test + public void allReturnedTagsAreClosed() { + throw new RuntimeException(); + } + + @Test + public void allTagsAreLowerCase() { + throw new RuntimeException(); + } + + @Test + public void previousListItemClosed() { + throw new RuntimeException(); + } + + @Test + public void nestedBlocks() { + throw new RuntimeException(); + } + + @Test + public void attributes() { + throw new RuntimeException(); + } + + private static class CaptureTagsAction<T> implements MarkwonHtmlParser.FlushAction<T> { + + boolean called; + List<T> tags; + + @Override + public void apply(@NonNull List<T> tags) { + this.called = true; + this.tags = new ArrayList<>(tags); + } + } + + private static class CaptureInlineTagsAction extends CaptureTagsAction<HtmlTag.Inline> { + } + + private static class CaptureBlockTagsAction extends CaptureTagsAction<HtmlTag.Block> { + } +} \ No newline at end of file