Improve empty tag replacement

This commit is contained in:
Dimitry Ivanov 2018-08-19 18:46:55 +03:00
parent 5c9ba0f252
commit c7c998db8f
9 changed files with 163 additions and 69 deletions

View File

@ -2,6 +2,10 @@ package ru.noties.markwon.html.api;
import android.support.annotation.NonNull;
/**
* @see MarkwonHtmlParser
* @since 2.0.0
*/
class MarkwonHtmlParserNoOp extends MarkwonHtmlParser {
@Override

View File

@ -3,14 +3,15 @@ package ru.noties.markwon.html.impl;
import android.support.annotation.NonNull;
import android.support.annotation.Nullable;
import ru.noties.markwon.html.impl.jsoup.parser.Token;
import ru.noties.markwon.html.api.HtmlTag;
/**
* This class will be used to append some text to output in order to
* apply a Span for this tag. Please note that this class will be used for
* _void_ tags and tags that are self-closed (even if HTML spec doesn't specify
* a tag as self-closed). This is due to the fact that underlying parser does not
* validate context and does not check if a tag is correctly used.
* validate context and does not check if a tag is correctly used. Plus it will be
* used for tags without content, for example: {@code <my-custom-element></my-custom-element>}
*
* @since 2.0.0
*/
@ -28,15 +29,16 @@ public class HtmlEmptyTagReplacement {
* lead to `Inline` tag have start &amp; end the same value, thus not applicable for applying a Span)
*/
@Nullable
public String replace(@NonNull Token.StartTag startTag) {
public String replace(@NonNull HtmlTag tag) {
final String replacement;
final String name = startTag.normalName;
final String name = tag.name();
if ("br".equals(name)) {
replacement = "\n";
} else if ("img".equals(name)) {
final String alt = startTag.attributes.getIgnoreCase("alt");
final String alt = tag.attributes().get("alt");
if (alt == null
|| alt.length() == 0) {
// no alt is provided
@ -50,5 +52,4 @@ public class HtmlEmptyTagReplacement {
return replacement;
}
}

View File

@ -119,6 +119,11 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
private boolean isInsidePreTag;
private Tokeniser tokeniser;
private CharacterReader reader;
MarkwonHtmlParserImpl(
@NonNull HtmlEmptyTagReplacement replacement,
@NonNull TrimmingAppender trimmingAppender) {
@ -126,13 +131,14 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
this.trimmingAppender = trimmingAppender;
}
@Override
public <T extends Appendable & CharSequence> void processFragment(
@NonNull T output,
@NonNull String htmlFragment) {
// todo: maybe there is a way to reuse tokeniser...
// we might want to reuse tokeniser (at least when the same output is involved)
// as CharacterReader does a bit of initialization (cache etc) as its
// primary usage is parsing a document in one run (not parsing _fragments_)
final Tokeniser tokeniser = new Tokeniser(new CharacterReader(htmlFragment), ParseErrorList.noTracking());
while (true) {
@ -239,7 +245,7 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
if (isVoidTag(name)
|| startTag.selfClosing) {
final String replacement = emptyTagReplacement.replace(startTag);
final String replacement = emptyTagReplacement.replace(inline);
if (replacement != null
&& replacement.length() > 0) {
appendQuietly(output, replacement);
@ -261,6 +267,12 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
// try to find it, if none found -> ignore
final HtmlTagImpl.InlineImpl openInline = findOpenInlineTag(endTag.normalName);
if (openInline != null) {
// okay, if this tag is empty -> call replacement
if (isEmpty(output, openInline)) {
appendEmptyTagReplacement(output, openInline);
}
// close open inline tag
openInline.closeAt(output.length());
}
@ -301,7 +313,7 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
final boolean isVoid = isVoidTag(name) || startTag.selfClosing;
if (isVoid) {
final String replacement = emptyTagReplacement.replace(startTag);
final String replacement = emptyTagReplacement.replace(block);
if (replacement != null
&& replacement.length() > 0) {
appendQuietly(output, replacement);
@ -331,6 +343,11 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
isInsidePreTag = false;
}
// okay, if this tag is empty -> call replacement
if (isEmpty(output, block)) {
appendEmptyTagReplacement(output, block);
}
block.closeAt(output.length());
if (TAG_PARAGRAPH.equals(name)) {
@ -434,4 +451,19 @@ public class MarkwonHtmlParserImpl extends MarkwonHtmlParser {
return map;
}
/**
 * Tells whether a tag is considered empty: its {@code start} position is the same as
 * the current length of the output.
 *
 * @param output the output being built
 * @param tag    the tag to check
 * @return {@code true} if {@code tag.start} equals {@code output.length()}
 */
protected static <T extends Appendable & CharSequence> boolean isEmpty(
        @NonNull T output,
        @NonNull HtmlTagImpl tag) {
    final int currentLength = output.length();
    return currentLength == tag.start;
}
/**
 * Requests a replacement text for the supplied tag from {@code emptyTagReplacement} and
 * appends it to the output.
 * <p>
 * A {@code null} or zero-length replacement is skipped, which is the same rule the
 * void/self-closing tag handling in this class applies before calling {@code appendQuietly}.
 *
 * @param output the output to append the replacement to
 * @param tag    the tag for which a replacement is requested
 */
protected <T extends Appendable & CharSequence> void appendEmptyTagReplacement(
        @NonNull T output,
        @NonNull HtmlTagImpl tag) {
    final String replacement = emptyTagReplacement.replace(tag);
    if (replacement != null
            && replacement.length() > 0) {
        appendQuietly(output, replacement);
    }
}
}

View File

@ -250,7 +250,8 @@ public class Attributes implements Iterable<Attribute>, Cloneable {
@Override
public Attribute next() {
final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this);
final String val = vals[i];
final Attribute attr = new Attribute(keys[i], val == null ? "" : val, Attributes.this);
i++;
return attr;
}
@ -262,21 +263,21 @@ public class Attributes implements Iterable<Attribute>, Cloneable {
};
}
/**
 * Get the attributes as a List, for iteration.
 *
 * @return a newly built, unmodifiable List with one Attribute per stored key/value pair.
 */
public List<Attribute> asList() {
    final List<Attribute> attributes = new ArrayList<>(size);
    int index = 0;
    while (index < size) {
        attributes.add(new Attribute(keys[index], vals[index], Attributes.this));
        index++;
    }
    return Collections.unmodifiableList(attributes);
}
// /**
// Get the attributes as a List, for iteration.
// @return a view of the attributes as an unmodifiable List.
// */
// public List<Attribute> asList() {
// ArrayList<Attribute> list = new ArrayList<>(size);
// for (int i = 0; i < size; i++) {
//// Attribute attr = vals[i] == null ?
//// new BooleanAttribute(keys[i]) : // deprecated class, but maybe someone still wants it
//// new Attribute(keys[i], vals[i], Attributes.this);
//// list.add(attr);
// list.add(new Attribute(keys[i], vals[i], Attributes.this));
// }
// return Collections.unmodifiableList(list);
// }
// /**
// * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys

View File

@ -1,5 +1,7 @@
package ru.noties.markwon.html.impl.jsoup.parser;
import android.support.annotation.NonNull;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -10,12 +12,12 @@ import ru.noties.markwon.html.impl.jsoup.UncheckedIOException;
import ru.noties.markwon.html.impl.jsoup.helper.Validate;
/**
CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
* CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
*/
public final class CharacterReader {
static final char EOF = (char) -1;
private static final int maxStringCacheLen = 12;
static final int maxBufferLen = 1024 * 32; // visible for testing
static final int maxBufferLen = 1024 * 4; // visible for testing
private static final int readAheadLimit = (int) (maxBufferLen * 0.75);
private final char[] charBuf;
@ -25,13 +27,13 @@ public final class CharacterReader {
private int bufPos;
private int readerPos;
private int bufMark;
private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage
private final String[] stringCache = new String[128]; // holds reused strings in this doc, to lessen garbage
public CharacterReader(Reader input, int sz) {
Validate.notNull(input);
Validate.isTrue(input.markSupported());
reader = input;
charBuf = new char[sz > maxBufferLen ? maxBufferLen : sz];
charBuf = new char[maxBufferLen];
bufferUp();
}
@ -43,6 +45,15 @@ public final class CharacterReader {
this(new StringReader(input), input.length());
}
// public void swapInput(@NonNull String input) {
// reader = new StringReader(input);
// bufLength = 0;
// bufSplitPoint = 0;
// bufPos = 0;
// readerPos = 0;
// bufferUp();
// }
private void bufferUp() {
if (bufPos < bufSplitPoint)
return;
@ -66,6 +77,7 @@ public final class CharacterReader {
/**
* Gets the current cursor position in the content.
*
* @return current position
*/
public int pos() {
@ -74,6 +86,7 @@ public final class CharacterReader {
/**
* Tests if all the content has been read.
*
* @return true if nothing left to read.
*/
public boolean isEmpty() {
@ -87,6 +100,7 @@ public final class CharacterReader {
/**
* Get the char at the current position.
*
* @return char
*/
public char current() {
@ -122,6 +136,7 @@ public final class CharacterReader {
/**
* Returns the number of characters between the current position and the next instance of the input char
*
* @param c scan target
* @return offset between current position and next instance of target. -1 if not found.
*/
@ -148,9 +163,9 @@ public final class CharacterReader {
for (int offset = bufPos; offset < bufLength; offset++) {
// scan to first instance of startchar:
if (startChar != charBuf[offset])
while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
while (++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
int i = offset + 1;
int last = i + seq.length()-1;
int last = i + seq.length() - 1;
if (offset < bufLength && last <= bufLength) {
for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
if (i == last) // found full sequence
@ -162,6 +177,7 @@ public final class CharacterReader {
/**
* Reads characters up to the specific char.
*
* @param c the delimiter
* @return the chars read
*/
@ -189,6 +205,7 @@ public final class CharacterReader {
/**
* Read characters until the first of any delimiters is found.
*
* @param chars delimiters to scan for
* @return characters read up to the matched delimiter.
*/
@ -198,7 +215,8 @@ public final class CharacterReader {
final int remaining = bufLength;
final char[] val = charBuf;
OUTER: while (bufPos < remaining) {
OUTER:
while (bufPos < remaining) {
for (char c : chars) {
if (val[bufPos] == c)
break OUTER;
@ -206,7 +224,7 @@ public final class CharacterReader {
bufPos++;
}
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
}
String consumeToAnySorted(final char... chars) {
@ -221,7 +239,7 @@ public final class CharacterReader {
bufPos++;
}
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
}
String consumeData() {
@ -233,12 +251,12 @@ public final class CharacterReader {
while (bufPos < remaining) {
final char c = val[bufPos];
if (c == '&'|| c == '<' || c == TokeniserState.nullChar)
if (c == '&' || c == '<' || c == TokeniserState.nullChar)
break;
bufPos++;
}
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
}
String consumeTagName() {
@ -250,12 +268,12 @@ public final class CharacterReader {
while (bufPos < remaining) {
final char c = val[bufPos];
if (c == '\t'|| c == '\n'|| c == '\r'|| c == '\f'|| c == ' '|| c == '/'|| c == '>'|| c == TokeniserState.nullChar)
if (c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == ' ' || c == '/' || c == '>' || c == TokeniserState.nullChar)
break;
bufPos++;
}
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos -start) : "";
return bufPos > start ? cacheString(charBuf, stringCache, start, bufPos - start) : "";
}
String consumeToEnd() {
@ -338,7 +356,7 @@ public final class CharacterReader {
return false;
for (int offset = 0; offset < scanLength; offset++)
if (seq.charAt(offset) != charBuf[bufPos +offset])
if (seq.charAt(offset) != charBuf[bufPos + offset])
return false;
return true;
}
@ -423,7 +441,7 @@ public final class CharacterReader {
/**
* Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks.
* <p />
* <p/>
* Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
* That saves both having to create objects as hash keys, and running through the entry list, at the expense of
* some more duplicates.

View File

@ -3,8 +3,10 @@ package ru.noties.markwon.html.impl;
import org.junit.Before;
import org.junit.Test;
import ru.noties.markwon.html.impl.jsoup.nodes.Attributes;
import ru.noties.markwon.html.impl.jsoup.parser.Token;
import java.util.Collections;
import ru.noties.markwon.html.api.HtmlTag;
import ru.noties.markwon.html.impl.HtmlTagImpl.InlineImpl;
import static org.junit.Assert.assertEquals;
@ -19,24 +21,27 @@ public class HtmlEmptyTagReplacementTest {
@Test
public void imageReplacementNoAlt() {
final Token.StartTag startTag = new Token.StartTag();
startTag.normalName = "img";
assertEquals("\uFFFC", replacement.replace(startTag));
final HtmlTag.Inline img = new InlineImpl("img", -1, Collections.<String, String>emptyMap());
assertEquals("\uFFFC", replacement.replace(img));
}
@Test
public void imageReplacementAlt() {
final Token.StartTag startTag = new Token.StartTag();
startTag.normalName = "img";
startTag.attributes = new Attributes().put("alt", "alternative27");
assertEquals("alternative27", replacement.replace(startTag));
final HtmlTag.Inline img = new InlineImpl(
"img",
-1,
Collections.singletonMap("alt", "alternative27")
);
assertEquals("alternative27", replacement.replace(img));
}
@Test
public void brAddsNewLine() {
final Token.StartTag startTag = new Token.StartTag();
startTag.normalName = "br";
startTag.selfClosing = true;
assertEquals("\n", replacement.replace(startTag));
final HtmlTag.Inline br = new InlineImpl(
"br",
-1,
Collections.<String, String>emptyMap()
);
assertEquals("\n", replacement.replace(br));
}
}

View File

@ -17,7 +17,6 @@ import java.util.Set;
import ru.noties.markwon.html.api.HtmlTag;
import ru.noties.markwon.html.api.MarkwonHtmlParser;
import ru.noties.markwon.html.impl.jsoup.parser.Token;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@ -36,8 +35,8 @@ public class MarkwonHtmlParserImplTest {
final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
@Nullable
@Override
public String replace(@NonNull Token.StartTag startTag) {
return startTag.normalName;
public String replace(@NonNull HtmlTag tag) {
return tag.name();
}
});
@ -98,7 +97,7 @@ public class MarkwonHtmlParserImplTest {
final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
@Nullable
@Override
public String replace(@NonNull Token.StartTag startTag) {
public String replace(@NonNull HtmlTag tag) {
return null;
}
});
@ -143,7 +142,7 @@ public class MarkwonHtmlParserImplTest {
final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
@Nullable
@Override
public String replace(@NonNull Token.StartTag startTag) {
public String replace(@NonNull HtmlTag tag) {
return null;
}
});
@ -212,7 +211,7 @@ public class MarkwonHtmlParserImplTest {
final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
@Nullable
@Override
public String replace(@NonNull Token.StartTag startTag) {
public String replace(@NonNull HtmlTag tag) {
return null;
}
});
@ -278,10 +277,9 @@ public class MarkwonHtmlParserImplTest {
);
final MarkwonHtmlParserImpl impl = MarkwonHtmlParserImpl.create(new HtmlEmptyTagReplacement() {
@Nullable
@Override
public String replace(@NonNull Token.StartTag startTag) {
return startTag.normalName;
public String replace(@NonNull HtmlTag tag) {
return tag.name();
}
});
@ -473,8 +471,6 @@ public class MarkwonHtmlParserImplTest {
final StringBuilder output = new StringBuilder();
impl.processFragment(output, "<DiV><I>italic <eM>emphasis</Em> italic</i></dIv>");
System.out.printf("output: `%s`%n", output);
final CaptureInlineTagsAction inlineTagsAction = new CaptureInlineTagsAction();
final CaptureBlockTagsAction blockTagsAction = new CaptureBlockTagsAction();
@ -804,6 +800,39 @@ public class MarkwonHtmlParserImplTest {
assertEquals(Arrays.toString(split), 5, split.length);
}
@Test
public void attributesAreLowerCase() {

    // attribute names are given in mixed case; `dIsAbLeD` carries no value at all
    final String html = "<i CLASS=\"my-class\" dIsAbLeD @HeLLo=\"there\">";

    final StringBuilder builder = new StringBuilder();
    final MarkwonHtmlParserImpl parser = MarkwonHtmlParserImpl.create();
    parser.processFragment(builder, html);

    final CaptureInlineTagsAction captured = new CaptureInlineTagsAction();
    parser.flushInlineTags(builder.length(), captured);

    // exactly one inline tag must be reported
    assertTrue(captured.called);
    assertEquals(1, captured.tags.size());

    with(captured.tags.get(0), new Action<HtmlTag.Inline>() {
        @Override
        public void apply(@NonNull HtmlTag.Inline tag) {

            assertEquals("i", tag.name());

            with(tag.attributes(), new Action<Map<String, String>>() {
                @Override
                public void apply(@NonNull Map<String, String> attributes) {
                    // keys are expected in lower case, values untouched,
                    // and a value-less attribute maps to an empty string
                    assertEquals(3, attributes.size());
                    assertEquals("my-class", attributes.get("class"));
                    assertEquals("", attributes.get("disabled"));
                    assertEquals("there", attributes.get("@hello"));
                }
            });
        }
    });
}
private static class CaptureTagsAction<T> implements MarkwonHtmlParser.FlushAction<T> {
boolean called;

View File

@ -22,13 +22,12 @@ import java.util.Iterator;
@SuppressWarnings({"WeakerAccess", "unused"})
public class SpannableBuilder implements Appendable, CharSequence {
/**
* @since 2.0.0
*/
public static void setSpans(@NonNull SpannableBuilder builder, @Nullable Object spans, int start, int end) {
if (spans != null) {
// let's filter non-valid positions here, so there is no need to validate
// it whilst applying non-closed html tags
//
// setting a span for an invalid position can lead to silent fail (no exception,
// but execution is stopped)
if (!isPositionValid(builder.length(), start, end)) {

View File

@ -22,6 +22,11 @@ public abstract class TagHandler {
TagHandler handler;
for (HtmlTag.Block child : block.children()) {
if (!child.isClosed()) {
continue;
}
handler = configuration.htmlRenderer().tagHandler(child.name());
if (handler != null) {
handler.handle(configuration, builder, child);