From c7c998db8f14f597d4291d6e08327a286619e8ef Mon Sep 17 00:00:00 2001 From: Dimitry Ivanov Date: Sun, 19 Aug 2018 18:46:55 +0300 Subject: [PATCH] Improve empty tag replacement --- .../html/api/MarkwonHtmlParserNoOp.java | 4 ++ .../html/impl/HtmlEmptyTagReplacement.java | 13 ++--- .../html/impl/MarkwonHtmlParserImpl.java | 40 +++++++++++++-- .../html/impl/jsoup/nodes/Attributes.java | 33 ++++++------ .../impl/jsoup/parser/CharacterReader.java | 48 +++++++++++------ .../impl/HtmlEmptyTagReplacementTest.java | 31 ++++++----- .../html/impl/MarkwonHtmlParserImplTest.java | 51 +++++++++++++++---- .../ru/noties/markwon/SpannableBuilder.java | 7 ++- .../renderer/html2/tag/TagHandler.java | 5 ++ 9 files changed, 163 insertions(+), 69 deletions(-) diff --git a/markwon-html-parser-api/src/main/java/ru/noties/markwon/html/api/MarkwonHtmlParserNoOp.java b/markwon-html-parser-api/src/main/java/ru/noties/markwon/html/api/MarkwonHtmlParserNoOp.java index fb55c5f8..f50a8941 100644 --- a/markwon-html-parser-api/src/main/java/ru/noties/markwon/html/api/MarkwonHtmlParserNoOp.java +++ b/markwon-html-parser-api/src/main/java/ru/noties/markwon/html/api/MarkwonHtmlParserNoOp.java @@ -2,6 +2,10 @@ package ru.noties.markwon.html.api; import android.support.annotation.NonNull; +/** + * @see MarkwonHtmlParser + * @since 2.0.0 + */ class MarkwonHtmlParserNoOp extends MarkwonHtmlParser { @Override diff --git a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacement.java b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacement.java index 6e852b1c..c0d304dc 100644 --- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacement.java +++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacement.java @@ -3,14 +3,15 @@ package ru.noties.markwon.html.impl; import android.support.annotation.NonNull; import android.support.annotation.Nullable; -import ru.noties.markwon.html.impl.jsoup.parser.Token; +import ru.noties.markwon.html.api.HtmlTag; /** * This class will be used to append some text to output in order to * apply a Span for this tag. Please note that this class will be used for * _void_ tags and tags that are self-closed (even if HTML spec doesn\'t specify * a tag as self-closed). This is due to the fact that underlying parser does not - * validate context and does not check if a tag is correctly used. + * validate context and does not check if a tag is correctly used. Plus it will be + * used for tags without content, for example: {@code } * * @since 2.0.0 */ @@ -28,15 +29,16 @@ public class HtmlEmptyTagReplacement { * lead to `Inline` tag have start & end the same value, thus not applicable for applying a Span) */ @Nullable - public String replace(@NonNull Token.StartTag startTag) { + public String replace(@NonNull HtmlTag tag) { final String replacement; - final String name = startTag.normalName; + final String name = tag.name(); + if ("br".equals(name)) { replacement = "\n"; } else if ("img".equals(name)) { - final String alt = startTag.attributes.getIgnoreCase("alt"); + final String alt = tag.attributes().get("alt"); if (alt == null || alt.length() == 0) { // no alt is provided @@ -50,5 +52,4 @@ public class HtmlEmptyTagReplacement { return replacement; } - } diff --git a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImpl.java b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImpl.java index 7ac819f9..791147d2 100644 --- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImpl.java +++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImpl.java @@ -119,6 +119,11 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { private boolean isInsidePreTag; + private Tokeniser tokeniser; + + private CharacterReader reader; + + MarkwonHtmlParserImpl( @NonNull HtmlEmptyTagReplacement replacement, @NonNull TrimmingAppender trimmingAppender) { @@ -126,13 +131,14 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { this.trimmingAppender = trimmingAppender; } - @Override public void processFragment( @NonNull T output, @NonNull String htmlFragment) { - // todo: maybe there is a way to reuse tokeniser... + // we might want to reuse tokeniser (at least when the same output is involved) + // as CharacterReader does a bit of initialization (cache etc) as it's + // primary usage is parsing a document in one run (not parsing _fragments_) final Tokeniser tokeniser = new Tokeniser(new CharacterReader(htmlFragment), ParseErrorList.noTracking()); while (true) { @@ -239,7 +245,7 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { if (isVoidTag(name) || startTag.selfClosing) { - final String replacement = emptyTagReplacement.replace(startTag); + final String replacement = emptyTagReplacement.replace(inline); if (replacement != null && replacement.length() > 0) { appendQuietly(output, replacement); @@ -261,6 +267,12 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { // try to find it, if none found -> ignore final HtmlTagImpl.InlineImpl openInline = findOpenInlineTag(endTag.normalName); if (openInline != null) { + + // okay, if this tag is empty -> call replacement + if (isEmpty(output, openInline)) { + appendEmptyTagReplacement(output, openInline); + } + // close open inline tag openInline.closeAt(output.length()); } @@ -301,7 +313,7 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { final boolean isVoid = isVoidTag(name) || startTag.selfClosing; if (isVoid) { - final String replacement = emptyTagReplacement.replace(startTag); + final String replacement = emptyTagReplacement.replace(block); if (replacement != null && replacement.length() > 0) { appendQuietly(output, replacement); @@ -331,6 +343,11 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { isInsidePreTag = false; } + // okay, if this tag is empty -> call replacement + if (isEmpty(output, block)) { + appendEmptyTagReplacement(output, block); + } + block.closeAt(output.length()); if (TAG_PARAGRAPH.equals(name)) { @@ -434,4 +451,19 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser { return map; } + + protected static boolean isEmpty( + @NonNull T output, + @NonNull HtmlTagImpl tag) { + return tag.start == output.length(); + } + + protected void appendEmptyTagReplacement( + @NonNull T output, + @NonNull HtmlTagImpl tag) { + final String replacement = emptyTagReplacement.replace(tag); + if (replacement != null) { + appendQuietly(output, replacement); + } + } } diff --git a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/Attributes.java b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/Attributes.java index 9735e8f4..71494812 100644 --- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/Attributes.java +++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/Attributes.java @@ -250,7 +250,8 @@ public class Attributes implements Iterable, Cloneable { @Override public Attribute next() { - final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this); + final String val = vals[i]; + final Attribute attr = new Attribute(keys[i], val == null ? "" : val, Attributes.this); i++; return attr; } @@ -262,21 +263,21 @@ public class Attributes implements Iterable, Cloneable { }; } - /** - Get the attributes as a List, for iteration. - @return an view of the attributes as an unmodifialbe List. - */ - public List asList() { - ArrayList list = new ArrayList<>(size); - for (int i = 0; i < size; i++) { -// Attribute attr = vals[i] == null ? -// new BooleanAttribute(keys[i]) : // deprecated class, but maybe someone still wants it -// new Attribute(keys[i], vals[i], Attributes.this); -// list.add(attr); - list.add(new Attribute(keys[i], vals[i], Attributes.this)); - } - return Collections.unmodifiableList(list); - } +// /** +// Get the attributes as a List, for iteration. +// @return an view of the attributes as an unmodifialbe List. +// */ +// public List asList() { +// ArrayList list = new ArrayList<>(size); +// for (int i = 0; i < size; i++) { +//// Attribute attr = vals[i] == null ? +//// new BooleanAttribute(keys[i]) : // deprecated class, but maybe someone still wants it +//// new Attribute(keys[i], vals[i], Attributes.this); +//// list.add(attr); +// list.add(new Attribute(keys[i], vals[i], Attributes.this)); +// } +// return Collections.unmodifiableList(list); +// } // /** // * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys diff --git a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/parser/CharacterReader.java b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/parser/CharacterReader.java index 0ae673a9..f7b7d892 100644 --- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/parser/CharacterReader.java +++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/parser/CharacterReader.java @@ -1,5 +1,7 @@ package ru.noties.markwon.html.impl.jsoup.parser; +import android.support.annotation.NonNull; + import java.io.IOException; import java.io.Reader; import java.io.StringReader; @@ -10,12 +12,12 @@ import ru.noties.markwon.html.impl.jsoup.UncheckedIOException; import ru.noties.markwon.html.impl.jsoup.helper.Validate; /** - CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes. + * CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes. */ public final class CharacterReader { static final char EOF = (char) -1; private static final int maxStringCacheLen = 12; - static final int maxBufferLen = 1024 * 32; // visible for testing + static final int maxBufferLen = 1024 * 4; // visible for testing private static final int readAheadLimit = (int) (maxBufferLen * 0.75); private final char[] charBuf; @@ -25,13 +27,13 @@ public final class CharacterReader { private int bufPos; private int readerPos; private int bufMark; - private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage + private final String[] stringCache = new String[128]; // holds reused strings in this doc, to lessen garbage public CharacterReader(Reader input, int sz) { Validate.notNull(input); Validate.isTrue(input.markSupported()); reader = input; - charBuf = new char[sz > maxBufferLen ? maxBufferLen : sz]; + charBuf = new char[maxBufferLen]; bufferUp(); } @@ -43,6 +45,15 @@ public final class CharacterReader { this(new StringReader(input), input.length()); } +// public void swapInput(@NonNull String input) { +// reader = new StringReader(input); +// bufLength = 0; +// bufSplitPoint = 0; +// bufPos = 0; +// readerPos = 0; +// bufferUp(); +// } + private void bufferUp() { if (bufPos < bufSplitPoint) return; @@ -66,6 +77,7 @@ public final class CharacterReader { /** * Gets the current cursor position in the content. + * * @return current position */ public int pos() { @@ -74,6 +86,7 @@ public final class CharacterReader { /** * Tests if all the content has been read. + * * @return true if nothing left to read. */ public boolean isEmpty() { @@ -87,6 +100,7 @@ public final class CharacterReader { /** * Get the char at the current position. + * * @return char */ public char current() { @@ -122,6 +136,7 @@ public final class CharacterReader { /** * Returns the number of characters between the current position and the next instance of the input char + * * @param c scan target * @return offset between current position and next instance of target. -1 if not found. */ @@ -148,9 +163,9 @@ public final class CharacterReader { for (int offset = bufPos; offset < bufLength; offset++) { // scan to first instance of startchar: if (startChar != charBuf[offset]) - while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ } + while (++offset < bufLength && startChar != charBuf[offset]) { /* empty */ } int i = offset + 1; - int last = i + seq.length()-1; + int last = i + seq.length() - 1; if (offset < bufLength && last <= bufLength) { for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ } if (i == last) // found full sequence @@ -162,6 +177,7 @@ public final class CharacterReader { /** * Reads characters up to the specific char. + * * @param c the delimiter * @return the chars read */ @@ -189,6 +205,7 @@ public final class CharacterReader { /** * Read characters until the first of any delimiters is found. + * * @param chars delimiters to scan for * @return characters read up to the matched delimiter. */ @@ -198,7 +215,8 @@ public final class CharacterReader { final int remaining = bufLength; final char[] val = charBuf; - OUTER: while (bufPos < remaining) { + OUTER: + while (bufPos < remaining) { for (char c : chars) { if (val[bufPos] == c) break OUTER; @@ -206,7 +224,7 @@ public final class CharacterReader { bufPos++; } - return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; + return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : ""; } String consumeToAnySorted(final char... chars) { @@ -221,7 +239,7 @@ public final class CharacterReader { bufPos++; } - return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; + return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : ""; } String consumeData() { @@ -233,12 +251,12 @@ public final class CharacterReader { while (bufPos < remaining) { final char c = val[bufPos]; - if (c == '&'|| c == '<' || c == TokeniserState.nullChar) + if (c == '&' || c == '<' || c == TokeniserState.nullChar) break; bufPos++; } - return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; + return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : ""; } String consumeTagName() { @@ -250,12 +268,12 @@ public final class CharacterReader { while (bufPos < remaining) { final char c = val[bufPos]; - if (c == '\t'|| c == '\n'|| c == '\r'|| c == '\f'|| c == ' '|| c == '/'|| c == '>'|| c == TokeniserState.nullChar) + if (c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == ' ' || c == '/' || c == '>' || c == TokeniserState.nullChar) break; bufPos++; } - return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : ""; + return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : ""; } String consumeToEnd() { @@ -338,7 +356,7 @@ public final class CharacterReader { return false; for (int offset = 0; offset < scanLength; offset++) - if (seq.charAt(offset) != charBuf[bufPos +offset]) + if (seq.charAt(offset) != charBuf[bufPos + offset]) return false; return true; } @@ -423,7 +441,7 @@ public final class CharacterReader { /** * Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks. - *

+ *

* Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list. * That saves both having to create objects as hash keys, and running through the entry list, at the expense of * some more duplicates. diff --git a/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacementTest.java b/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacementTest.java index b4ca8996..eab3a9f1 100644 --- a/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacementTest.java +++ b/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacementTest.java @@ -3,8 +3,10 @@ package ru.noties.markwon.html.impl; import org.junit.Before; import org.junit.Test; -import ru.noties.markwon.html.impl.jsoup.nodes.Attributes; -import ru.noties.markwon.html.impl.jsoup.parser.Token; +import java.util.Collections; + +import ru.noties.markwon.html.api.HtmlTag; +import ru.noties.markwon.html.impl.HtmlTagImpl.InlineImpl; import static org.junit.Assert.assertEquals; @@ -19,24 +21,27 @@ public class HtmlEmptyTagReplacementTest { @Test public void imageReplacementNoAlt() { - final Token.StartTag startTag = new Token.StartTag(); - startTag.normalName = "img"; - assertEquals("\uFFFC", replacement.replace(startTag)); + final HtmlTag.Inline img = new InlineImpl("img", -1, Collections.emptyMap()); + assertEquals("\uFFFC", replacement.replace(img)); } @Test public void imageReplacementAlt() { - final Token.StartTag startTag = new Token.StartTag(); - startTag.normalName = "img"; - startTag.attributes = new Attributes().put("alt", "alternative27"); - assertEquals("alternative27", replacement.replace(startTag)); + final HtmlTag.Inline img = new InlineImpl( + "img", + -1, + Collections.singletonMap("alt", "alternative27") + ); + assertEquals("alternative27", replacement.replace(img)); } @Test public void brAddsNewLine() { - final Token.StartTag startTag = new Token.StartTag(); - startTag.normalName = "br"; - startTag.selfClosing = true; - assertEquals("\n", replacement.replace(startTag)); + final HtmlTag.Inline br = new InlineImpl( + "br", + -1, + Collections.emptyMap() + ); + assertEquals("\n", replacement.replace(br)); } } \ No newline at end of file diff --git a/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImplTest.java b/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImplTest.java index 1dd20668..3edcfeb1 100644 --- a/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImplTest.java +++ b/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImplTest.java @@ -17,7 +17,6 @@ import java.util.Set; import ru.noties.markwon.html.api.HtmlTag; import ru.noties.markwon.html.api.MarkwonHtmlParser; -import ru.noties.markwon.html.impl.jsoup.parser.Token; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -36,8 +35,8 @@ public class MarkwonHtmlParserImplTest { final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() { @Nullable @Override - public String replace(@NonNull Token.StartTag startTag) { - return startTag.normalName; + public String replace(@NonNull HtmlTag tag) { + return tag.name(); } }); @@ -98,7 +97,7 @@ public class MarkwonHtmlParserImplTest { final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() { @Nullable @Override - public String replace(@NonNull Token.StartTag startTag) { + public String replace(@NonNull HtmlTag tag) { return null; } }); @@ -143,7 +142,7 @@ public class MarkwonHtmlParserImplTest { final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() { @Nullable @Override - public String replace(@NonNull Token.StartTag startTag) { + public String replace(@NonNull HtmlTag tag) { return null; } }); @@ -212,7 +211,7 @@ public class MarkwonHtmlParserImplTest { final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() { @Nullable @Override - public String replace(@NonNull Token.StartTag startTag) { + public String replace(@NonNull HtmlTag tag) { return null; } }); @@ -278,10 +277,9 @@ public class MarkwonHtmlParserImplTest { ); final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() { - @Nullable @Override - public String replace(@NonNull Token.StartTag startTag) { - return startTag.normalName; + public String replace(@NonNull HtmlTag tag) { + return tag.name(); } }); @@ -473,8 +471,6 @@ public class MarkwonHtmlParserImplTest { final StringBuilder output = new StringBuilder(); impl.processFragment(output, "

italic emphasis italic
"); - System.out.printf("output: `%s`%n", output); - final CaptureInlineTagsAction inlineTagsAction = new CaptureInlineTagsAction(); final CaptureBlockTagsAction blockTagsAction = new CaptureBlockTagsAction(); @@ -804,6 +800,39 @@ public class MarkwonHtmlParserImplTest { assertEquals(Arrays.toString(split), 5, split.length); } + @Test + public void attributesAreLowerCase() { + + final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(); + final StringBuilder output = new StringBuilder(); + + impl.processFragment(output, ""); + + final CaptureInlineTagsAction action = new CaptureInlineTagsAction(); + impl.flushInlineTags(output.length(), action); + + assertTrue(action.called); + assertEquals(1, action.tags.size()); + + with(action.tags.get(0), new Action() { + @Override + public void apply(@NonNull HtmlTag.Inline inline) { + + assertEquals("i", inline.name()); + + with(inline.attributes(), new Action>() { + @Override + public void apply(@NonNull Map map) { + assertEquals(3, map.size()); + assertEquals("my-class", map.get("class")); + assertEquals("", map.get("disabled")); + assertEquals("there", map.get("@hello")); + } + }); + } + }); + } + private static class CaptureTagsAction implements MarkwonHtmlParser.FlushAction { boolean called; diff --git a/markwon/src/main/java/ru/noties/markwon/SpannableBuilder.java b/markwon/src/main/java/ru/noties/markwon/SpannableBuilder.java index 154ce8f6..a02b53de 100644 --- a/markwon/src/main/java/ru/noties/markwon/SpannableBuilder.java +++ b/markwon/src/main/java/ru/noties/markwon/SpannableBuilder.java @@ -22,13 +22,12 @@ import java.util.Iterator; @SuppressWarnings({"WeakerAccess", "unused"}) public class SpannableBuilder implements Appendable, CharSequence { - + /** + * @since 2.0.0 + */ public static void setSpans(@NonNull SpannableBuilder builder, @Nullable Object spans, int start, int end) { if (spans != null) { - // let's filter non-valid positions here, so there is no need to validate - // it whilst applying non-closed html tags - // // setting a span for an invalid position can lead to silent fail (no exception, // but execution is stopped) if (!isPositionValid(builder.length(), start, end)) { diff --git a/markwon/src/main/java/ru/noties/markwon/renderer/html2/tag/TagHandler.java b/markwon/src/main/java/ru/noties/markwon/renderer/html2/tag/TagHandler.java index dabc719d..818c4a98 100644 --- a/markwon/src/main/java/ru/noties/markwon/renderer/html2/tag/TagHandler.java +++ b/markwon/src/main/java/ru/noties/markwon/renderer/html2/tag/TagHandler.java @@ -22,6 +22,11 @@ public abstract class TagHandler { TagHandler handler; for (HtmlTag.Block child : block.children()) { + + if (!child.isClosed()) { + continue; + } + handler = configuration.htmlRenderer().tagHandler(child.name()); if (handler != null) { handler.handle(configuration, builder, child);