Improve empty tag replacement

2018-08-19 18:46:55 +03:00 · 2018-08-19 18:46:55 +03:00 · c7c998db8f
commit c7c998db8f
parent 5c9ba0f252
9 changed files with 163 additions and 69 deletions
--- a/markwon-html-parser-api/src/main/java/ru/noties/markwon/html/api/MarkwonHtmlParserNoOp.java
+++ b/markwon-html-parser-api/src/main/java/ru/noties/markwon/html/api/MarkwonHtmlParserNoOp.java
@ -2,6 +2,10 @@ package ru.noties.markwon.html.api;

 import android.support.annotation.NonNull;

+/**
+ * @see MarkwonHtmlParser
+ * @since 2.0.0
+ */
 class MarkwonHtmlParserNoOp extends MarkwonHtmlParser {

    @Override
--- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacement.java
+++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacement.java
@ -3,14 +3,15 @@ package ru.noties.markwon.html.impl;
 import android.support.annotation.NonNull;
 import android.support.annotation.Nullable;

-import ru.noties.markwon.html.impl.jsoup.parser.Token;
+import ru.noties.markwon.html.api.HtmlTag;

 /**
 * This class will be used to append some text to output in order to
 * apply a Span for this tag. Please note that this class will be used for
 * _void_ tags and tags that are self-closed (even if HTML spec doesn\'t specify
 * a tag as self-closed). This is due to the fact that underlying parser does not
- * validate context and does not check if a tag is correctly used.
+ * validate context and does not check if a tag is correctly used. Plus it will be
+ * used for tags without content, for example: {@code <my-custom-element></my-custom-element>}
 *
 * @since 2.0.0
 */
@ -28,15 +29,16 @@ public class HtmlEmptyTagReplacement {
     * lead to `Inline` tag have start &amp; end the same value, thus not applicable for applying a Span)
     */
    @Nullable
-    public String replace(@NonNull Token.StartTag startTag) {
+    public String replace(@NonNull HtmlTag tag) {

        final String replacement;

-        final String name = startTag.normalName;
+        final String name = tag.name();
+
        if ("br".equals(name)) {
            replacement = "\n";
        } else if ("img".equals(name)) {
-            final String alt = startTag.attributes.getIgnoreCase("alt");
+            final String alt = tag.attributes().get("alt");
            if (alt == null
                    || alt.length() == 0) {
                // no alt is provided
@ -50,5 +52,4 @@ public class HtmlEmptyTagReplacement {

        return replacement;
    }
-
 }
--- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImpl.java
+++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImpl.java
@ -119,6 +119,11 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {

    private boolean isInsidePreTag;

+    private Tokeniser tokeniser;
+
+    private CharacterReader reader;
+
+
    MarkwonHtmlParserImpl(
            @NonNull HtmlEmptyTagReplacement replacement,
            @NonNull TrimmingAppender trimmingAppender) {
@ -126,13 +131,14 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
        this.trimmingAppender = trimmingAppender;
    }

-
    @Override
    public <T extends Appendable & CharSequence> void processFragment(
            @NonNull T output,
            @NonNull String htmlFragment) {

-        // todo: maybe there is a way to reuse tokeniser...
+        // we might want to reuse tokeniser (at least when the same output is involved)
+        // as CharacterReader does a bit of initialization (cache etc) as it's
+        // primary usage is parsing a document in one run (not parsing _fragments_)
        final Tokeniser tokeniser = new Tokeniser(new CharacterReader(htmlFragment), ParseErrorList.noTracking());

        while (true) {
@ -239,7 +245,7 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
        if (isVoidTag(name)
                || startTag.selfClosing) {

-            final String replacement = emptyTagReplacement.replace(startTag);
+            final String replacement = emptyTagReplacement.replace(inline);
            if (replacement != null
                    && replacement.length() > 0) {
                appendQuietly(output, replacement);
@ -261,6 +267,12 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
        // try to find it, if none found -> ignore
        final HtmlTagImpl.InlineImpl openInline = findOpenInlineTag(endTag.normalName);
        if (openInline != null) {
+
+            // okay, if this tag is empty -> call replacement
+            if (isEmpty(output, openInline)) {
+                appendEmptyTagReplacement(output, openInline);
+            }
+
            // close open inline tag
            openInline.closeAt(output.length());
        }
@ -301,7 +313,7 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {

        final boolean isVoid = isVoidTag(name) || startTag.selfClosing;
        if (isVoid) {
-            final String replacement = emptyTagReplacement.replace(startTag);
+            final String replacement = emptyTagReplacement.replace(block);
            if (replacement != null
                    && replacement.length() > 0) {
                appendQuietly(output, replacement);
@ -331,6 +343,11 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
                isInsidePreTag = false;
            }

+            // okay, if this tag is empty -> call replacement
+            if (isEmpty(output, block)) {
+                appendEmptyTagReplacement(output, block);
+            }
+
            block.closeAt(output.length());

            if (TAG_PARAGRAPH.equals(name)) {
@ -434,4 +451,19 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {

        return map;
    }
+
+    protected static <T extends Appendable & CharSequence> boolean isEmpty(
+            @NonNull T output,
+            @NonNull HtmlTagImpl tag) {
+        return tag.start == output.length();
+    }
+
+    protected <T extends Appendable & CharSequence> void appendEmptyTagReplacement(
+            @NonNull T output,
+            @NonNull HtmlTagImpl tag) {
+        final String replacement = emptyTagReplacement.replace(tag);
+        if (replacement != null) {
+            appendQuietly(output, replacement);
+        }
+    }
 }
--- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/Attributes.java
+++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/nodes/Attributes.java
@ -250,7 +250,8 @@ public class Attributes implements Iterable<Attribute>, Cloneable {

            @Override
            public Attribute next() {
-                final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this);
+                final String val = vals[i];
+                final Attribute attr = new Attribute(keys[i], val == null ? "" : val, Attributes.this);
                i++;
                return attr;
            }
@ -262,21 +263,21 @@ public class Attributes implements Iterable<Attribute>, Cloneable {
        };
    }

-    /**
-     Get the attributes as a List, for iteration.
-     @return an view of the attributes as an unmodifialbe List.
-     */
-    public List<Attribute> asList() {
-        ArrayList<Attribute> list = new ArrayList<>(size);
-        for (int i = 0; i < size; i++) {
-//            Attribute attr = vals[i] == null ?
-//                    new BooleanAttribute(keys[i]) : // deprecated class, but maybe someone still wants it
-//                    new Attribute(keys[i], vals[i], Attributes.this);
-//            list.add(attr);
-            list.add(new Attribute(keys[i], vals[i], Attributes.this));
-        }
-        return Collections.unmodifiableList(list);
-    }
+//    /**
+//     Get the attributes as a List, for iteration.
+//     @return an view of the attributes as an unmodifialbe List.
+//     */
+//    public List<Attribute> asList() {
+//        ArrayList<Attribute> list = new ArrayList<>(size);
+//        for (int i = 0; i < size; i++) {
+////            Attribute attr = vals[i] == null ?
+////                    new BooleanAttribute(keys[i]) : // deprecated class, but maybe someone still wants it
+////                    new Attribute(keys[i], vals[i], Attributes.this);
+////            list.add(attr);
+//            list.add(new Attribute(keys[i], vals[i], Attributes.this));
+//        }
+//        return Collections.unmodifiableList(list);
+//    }

 //    /**
 //     * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys
--- a/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/parser/CharacterReader.java
+++ b/markwon-html-parser-impl/src/main/java/ru/noties/markwon/html/impl/jsoup/parser/CharacterReader.java
@ -1,5 +1,7 @@
 package ru.noties.markwon.html.impl.jsoup.parser;

+import android.support.annotation.NonNull;
+
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@ -10,12 +12,12 @@ import ru.noties.markwon.html.impl.jsoup.UncheckedIOException;
 import ru.noties.markwon.html.impl.jsoup.helper.Validate;

 /**
- CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
+ * CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
 */
 public final class CharacterReader {
    static final char EOF = (char) -1;
    private static final int maxStringCacheLen = 12;
-    static final int maxBufferLen = 1024 * 32; // visible for testing
+    static final int maxBufferLen = 1024 * 4; // visible for testing
    private static final int readAheadLimit = (int) (maxBufferLen * 0.75);

    private final char[] charBuf;
@ -25,13 +27,13 @@ public final class CharacterReader {
    private int bufPos;
    private int readerPos;
    private int bufMark;
-    private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage
+    private final String[] stringCache = new String[128]; // holds reused strings in this doc, to lessen garbage

    public CharacterReader(Reader input, int sz) {
        Validate.notNull(input);
        Validate.isTrue(input.markSupported());
        reader = input;
-        charBuf = new char[sz > maxBufferLen ? maxBufferLen : sz];
+        charBuf = new char[maxBufferLen];
        bufferUp();
    }

@ -43,6 +45,15 @@ public final class CharacterReader {
        this(new StringReader(input), input.length());
    }

+//    public void swapInput(@NonNull String input) {
+//        reader = new StringReader(input);
+//        bufLength = 0;
+//        bufSplitPoint = 0;
+//        bufPos = 0;
+//        readerPos = 0;
+//        bufferUp();
+//    }
+
    private void bufferUp() {
        if (bufPos < bufSplitPoint)
            return;
@ -66,6 +77,7 @@ public final class CharacterReader {

    /**
     * Gets the current cursor position in the content.
+     *
     * @return current position
     */
    public int pos() {
@ -74,6 +86,7 @@ public final class CharacterReader {

    /**
     * Tests if all the content has been read.
+     *
     * @return true if nothing left to read.
     */
    public boolean isEmpty() {
@ -87,6 +100,7 @@ public final class CharacterReader {

    /**
     * Get the char at the current position.
+     *
     * @return char
     */
    public char current() {
@ -122,6 +136,7 @@ public final class CharacterReader {

    /**
     * Returns the number of characters between the current position and the next instance of the input char
+     *
     * @param c scan target
     * @return offset between current position and next instance of target. -1 if not found.
     */
@ -148,9 +163,9 @@ public final class CharacterReader {
        for (int offset = bufPos; offset < bufLength; offset++) {
            // scan to first instance of startchar:
            if (startChar != charBuf[offset])
-                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
+                while (++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
            int i = offset + 1;
-            int last = i + seq.length()-1;
+            int last = i + seq.length() - 1;
            if (offset < bufLength && last <= bufLength) {
                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
                if (i == last) // found full sequence
@ -162,6 +177,7 @@ public final class CharacterReader {

    /**
     * Reads characters up to the specific char.
+     *
     * @param c the delimiter
     * @return the chars read
     */
@ -189,6 +205,7 @@ public final class CharacterReader {

    /**
     * Read characters until the first of any delimiters is found.
+     *
     * @param chars delimiters to scan for
     * @return characters read up to the matched delimiter.
     */
@ -198,7 +215,8 @@ public final class CharacterReader {
        final int remaining = bufLength;
        final char[] val = charBuf;

-        OUTER: while (bufPos < remaining) {
+        OUTER:
+        while (bufPos < remaining) {
            for (char c : chars) {
                if (val[bufPos] == c)
                    break OUTER;
@ -206,7 +224,7 @@ public final class CharacterReader {
            bufPos++;
        }

-        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
+        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
    }

    String consumeToAnySorted(final char... chars) {
@ -221,7 +239,7 @@ public final class CharacterReader {
            bufPos++;
        }

-        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
+        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
    }

    String consumeData() {
@ -233,12 +251,12 @@ public final class CharacterReader {

        while (bufPos < remaining) {
            final char c = val[bufPos];
-            if (c == '&'|| c ==  '<' || c ==  TokeniserState.nullChar)
+            if (c == '&' || c == '<' || c == TokeniserState.nullChar)
                break;
            bufPos++;
        }

-        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
+        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
    }

    String consumeTagName() {
@ -250,12 +268,12 @@ public final class CharacterReader {

        while (bufPos < remaining) {
            final char c = val[bufPos];
-            if (c == '\t'|| c ==  '\n'|| c ==  '\r'|| c ==  '\f'|| c ==  ' '|| c ==  '/'|| c ==  '>'|| c ==  TokeniserState.nullChar)
+            if (c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == ' ' || c == '/' || c == '>' || c == TokeniserState.nullChar)
                break;
            bufPos++;
        }

-        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
+        return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
    }

    String consumeToEnd() {
@ -338,7 +356,7 @@ public final class CharacterReader {
            return false;

        for (int offset = 0; offset < scanLength; offset++)
-            if (seq.charAt(offset) != charBuf[bufPos +offset])
+            if (seq.charAt(offset) != charBuf[bufPos + offset])
                return false;
        return true;
    }
@ -423,7 +441,7 @@ public final class CharacterReader {

    /**
     * Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks.
-     * <p />
+     * <p/>
     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
     * some more duplicates.
--- a/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacementTest.java
+++ b/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/HtmlEmptyTagReplacementTest.java
@ -3,8 +3,10 @@ package ru.noties.markwon.html.impl;
 import org.junit.Before;
 import org.junit.Test;

-import ru.noties.markwon.html.impl.jsoup.nodes.Attributes;
-import ru.noties.markwon.html.impl.jsoup.parser.Token;
+import java.util.Collections;
+
+import ru.noties.markwon.html.api.HtmlTag;
+import ru.noties.markwon.html.impl.HtmlTagImpl.InlineImpl;

 import static org.junit.Assert.assertEquals;

@ -19,24 +21,27 @@ public class HtmlEmptyTagReplacementTest {

    @Test
    public void imageReplacementNoAlt() {
-        final Token.StartTag startTag = new Token.StartTag();
-        startTag.normalName = "img";
-        assertEquals("\uFFFC", replacement.replace(startTag));
+        final HtmlTag.Inline img = new InlineImpl("img", -1, Collections.<String, String>emptyMap());
+        assertEquals("\uFFFC", replacement.replace(img));
    }

    @Test
    public void imageReplacementAlt() {
-        final Token.StartTag startTag = new Token.StartTag();
-        startTag.normalName = "img";
-        startTag.attributes = new Attributes().put("alt", "alternative27");
-        assertEquals("alternative27", replacement.replace(startTag));
+        final HtmlTag.Inline img = new InlineImpl(
+                "img",
+                -1,
+                Collections.singletonMap("alt", "alternative27")
+        );
+        assertEquals("alternative27", replacement.replace(img));
    }

    @Test
    public void brAddsNewLine() {
-        final Token.StartTag startTag = new Token.StartTag();
-        startTag.normalName = "br";
-        startTag.selfClosing = true;
-        assertEquals("\n", replacement.replace(startTag));
+        final HtmlTag.Inline br = new InlineImpl(
+                "br",
+                -1,
+                Collections.<String, String>emptyMap()
+        );
+        assertEquals("\n", replacement.replace(br));
    }
 }
--- a/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImplTest.java
+++ b/markwon-html-parser-impl/src/test/java/ru/noties/markwon/html/impl/MarkwonHtmlParserImplTest.java
@ -17,7 +17,6 @@ import java.util.Set;

 import ru.noties.markwon.html.api.HtmlTag;
 import ru.noties.markwon.html.api.MarkwonHtmlParser;
-import ru.noties.markwon.html.impl.jsoup.parser.Token;

 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@ -36,8 +35,8 @@ public class MarkwonHtmlParserImplTest {
        final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
            @Nullable
            @Override
-            public String replace(@NonNull Token.StartTag startTag) {
-                return startTag.normalName;
+            public String replace(@NonNull HtmlTag tag) {
+                return tag.name();
            }
        });

@ -98,7 +97,7 @@ public class MarkwonHtmlParserImplTest {
        final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
            @Nullable
            @Override
-            public String replace(@NonNull Token.StartTag startTag) {
+            public String replace(@NonNull HtmlTag tag) {
                return null;
            }
        });
@ -143,7 +142,7 @@ public class MarkwonHtmlParserImplTest {
        final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
            @Nullable
            @Override
-            public String replace(@NonNull Token.StartTag startTag) {
+            public String replace(@NonNull HtmlTag tag) {
                return null;
            }
        });
@ -212,7 +211,7 @@ public class MarkwonHtmlParserImplTest {
        final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
            @Nullable
            @Override
-            public String replace(@NonNull Token.StartTag startTag) {
+            public String replace(@NonNull HtmlTag tag) {
                return null;
            }
        });
@ -278,10 +277,9 @@ public class MarkwonHtmlParserImplTest {
        );

        final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
-            @Nullable
            @Override
-            public String replace(@NonNull Token.StartTag startTag) {
-                return startTag.normalName;
+            public String replace(@NonNull HtmlTag tag) {
+                return tag.name();
            }
        });

@ -473,8 +471,6 @@ public class MarkwonHtmlParserImplTest {
        final StringBuilder output = new StringBuilder();
        impl.processFragment(output, "<DiV><I>italic <eM>emphasis</Em> italic</i></dIv>");

-        System.out.printf("output: `%s`%n", output);
-
        final CaptureInlineTagsAction inlineTagsAction = new CaptureInlineTagsAction();
        final CaptureBlockTagsAction blockTagsAction = new CaptureBlockTagsAction();

@ -804,6 +800,39 @@ public class MarkwonHtmlParserImplTest {
        assertEquals(Arrays.toString(split), 5, split.length);
    }

+    @Test
+    public void attributesAreLowerCase() {
+
+        final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create();
+        final StringBuilder output = new StringBuilder();
+
+        impl.processFragment(output, "<i CLASS=\"my-class\" dIsAbLeD @HeLLo=\"there\">");
+
+        final CaptureInlineTagsAction action = new CaptureInlineTagsAction();
+        impl.flushInlineTags(output.length(), action);
+
+        assertTrue(action.called);
+        assertEquals(1, action.tags.size());
+
+        with(action.tags.get(0), new Action<HtmlTag.Inline>() {
+            @Override
+            public void apply(@NonNull HtmlTag.Inline inline) {
+
+                assertEquals("i", inline.name());
+
+                with(inline.attributes(), new Action<Map<String, String>>() {
+                    @Override
+                    public void apply(@NonNull Map<String, String> map) {
+                        assertEquals(3, map.size());
+                        assertEquals("my-class", map.get("class"));
+                        assertEquals("", map.get("disabled"));
+                        assertEquals("there", map.get("@hello"));
+                    }
+                });
+            }
+        });
+    }
+
    private static class CaptureTagsAction<T> implements MarkwonHtmlParser.FlushAction<T> {

        boolean called;
--- a/markwon/src/main/java/ru/noties/markwon/SpannableBuilder.java
+++ b/markwon/src/main/java/ru/noties/markwon/SpannableBuilder.java
@ -22,13 +22,12 @@ import java.util.Iterator;
@SuppressWarnings({"WeakerAccess", "unused"})
 public class SpannableBuilder implements Appendable, CharSequence {

-
+    /**
+     * @since 2.0.0
+     */
    public static void setSpans(@NonNull SpannableBuilder builder, @Nullable Object spans, int start, int end) {
        if (spans != null) {

-            // let's filter non-valid positions here, so there is no need to validate
-            // it whilst applying non-closed html tags
-            //
            // setting a span for an invalid position can lead to silent fail (no exception,
            // but execution is stopped)
            if (!isPositionValid(builder.length(), start, end)) {
--- a/markwon/src/main/java/ru/noties/markwon/renderer/html2/tag/TagHandler.java
+++ b/markwon/src/main/java/ru/noties/markwon/renderer/html2/tag/TagHandler.java
@ -22,6 +22,11 @@ public abstract class TagHandler {
        TagHandler handler;

        for (HtmlTag.Block child : block.children()) {
+
+            if (!child.isClosed()) {
+                continue;
+            }
+
            handler = configuration.htmlRenderer().tagHandler(child.name());
            if (handler != null) {
                handler.handle(configuration, builder, child);