I am studying Java and trying to write an HTML parser, which should parse tag names and attributes. I wrote a class (code below) using the State pattern.
I need this for a training project in which I currently use JSoup. JSoup is too slow for my purposes, so my main goal is better performance. Suggestions about following conventions and best practices are also welcome, and comments on the interface/API of my class would be appreciated too.
import java.io.BufferedReader;
import java.io.IOException;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;
/**
 * Minimal streaming scanner for HTML tags.
 *
 * <p>{@link #tag()} advances the reader to the next {@code '<'} and returns the tag name;
 * {@link #attribute()} then reads that tag's attributes up to the closing {@code '>'}.
 * Attribute parsing is a character-driven state machine (State pattern): every
 * {@link AttrStat} value is backed by one {@link State} object that decides what a
 * character means in that context and which state comes next.
 *
 * <p>An instance holds mutable parsing state bound to a single reader and performs no
 * synchronization of its own.
 */
public class AttributeParser {

    /** Identifiers of the states of the attribute state machine. */
    enum AttrStat {NAME, VALUE, VALUE_QUOTES, AFTER_NAME, NEW_ATTR, NEW_VALUE}

    /** Tag "name" that makes {@link #tag()} stop immediately (start of an HTML comment). */
    private static final String COMMENT_OPEN = "!--";

    private final BufferedReader reader;
    /** Name of the attribute currently being assembled. */
    private final StringBuilder name = new StringBuilder();
    /** Value of the attribute currently being assembled. */
    private final StringBuilder value = new StringBuilder();
    /** One shared state object per {@link AttrStat}; EnumMap lookups are plain array indexing. */
    private final Map<AttrStat, State> states = new EnumMap<>(AttrStat.class);
    private State current;
    /** Quote character that opened the value being read in {@link AttrStat#VALUE_QUOTES}. */
    private char quotes = ' ';
    /** Result map of the {@link #attribute()} call in progress. */
    private HashMap<String, String> attr;

    /**
     * Creates a parser that consumes characters from {@code reader}.
     *
     * @param reader source of the HTML text; must not be {@code null}
     * @throws NullPointerException if {@code reader} is {@code null}
     */
    public AttributeParser(BufferedReader reader) {
        if (reader == null) {
            throw new NullPointerException("reader");
        }
        this.reader = reader;
        states.put(AttrStat.NAME, new NameState());
        states.put(AttrStat.VALUE, new ValueState());
        states.put(AttrStat.VALUE_QUOTES, new ValueQuotesState());
        states.put(AttrStat.AFTER_NAME, new AfterNameState());
        states.put(AttrStat.NEW_ATTR, new NewAttrState());
        states.put(AttrStat.NEW_VALUE, new NewValueState());
        enter(AttrStat.NEW_ATTR);
    }

    /**
     * Skips input up to the next {@code '<'} and returns the tag name that follows it.
     *
     * <p>The name is every character up to (not including) the first whitespace character,
     * {@code '>'}, or end of input, returned exactly as written (so a closing tag yields a
     * name starting with {@code '/'}). Reading stops early as soon as the name equals
     * {@code "!--"}. A terminating {@code '>'} is pushed back so that a following
     * {@link #attribute()} call sees it and returns an empty map.
     *
     * @return the tag name (possibly empty, e.g. for {@code "<>"}), or {@code null} if the
     *         input ends before another {@code '<'} is found
     * @throws IOException if the underlying reader fails
     */
    public String tag() throws IOException {
        int ch = reader.read();
        // -1 is the only end-of-input marker; a NUL character (0) is ordinary input.
        while (ch != -1 && ch != '<') {
            ch = reader.read();
        }
        if (ch == -1) {
            return null;
        }

        StringBuilder tagName = new StringBuilder();
        while (true) {
            // Mark before every read so that reset() is always valid, including for "<>"
            // where '>' is the very first character after '<'.
            reader.mark(1);
            ch = reader.read();
            if (ch == '>') {
                reader.reset();
                break;
            }
            if (ch == -1 || isBreaker(ch)) {
                break;
            }
            tagName.append((char) ch);
            // contentEquals compares in place; no per-character String allocation.
            if (COMMENT_OPEN.contentEquals(tagName)) {
                break;
            }
        }
        return tagName.toString();
    }

    /**
     * Reads the attributes of the current tag, consuming input up to and including the
     * closing {@code '>'} (or up to end of input).
     *
     * <p>Attributes without a value map to the empty string. If a name occurs more than
     * once the first occurrence wins. A name consisting solely of {@code "/"} is dropped.
     *
     * @return a new map from attribute name to value for every call; never {@code null}
     * @throws IOException if the underlying reader fails
     */
    public HashMap<String, String> attribute() throws IOException {
        attr = new HashMap<>();
        // Always start from a clean state: the previous tag may have ended while the machine
        // was still inside a name or an unquoted value.
        enter(AttrStat.NEW_ATTR);
        boolean more = true;
        while (more) {
            more = current.read(reader.read());
        }
        // Flushes an attribute that was cut short by end of input; a no-op after '>'.
        addAttribute();
        return attr;
    }

    /** Makes {@code next} the active state. */
    private void enter(AttrStat next) {
        current = states.get(next);
    }

    /** Returns whether {@code ch} separates tokens inside a tag (space, tab, LF, CR, FF). */
    private static boolean isBreaker(int ch) {
        return ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r' || ch == '\f';
    }

    /**
     * Stores the pending name/value pair in the result map, unless the name is empty, is
     * {@code "/"}, or is already present, and then clears both buffers.
     */
    private void addAttribute() {
        if (name.length() > 0) {
            String key = name.toString();
            if (!"/".equals(key) && !attr.containsKey(key)) {
                attr.put(key, value.toString());
            }
        }
        name.setLength(0);
        value.setLength(0);
    }

    /**
     * Base class of all parser states. {@link #read(int)} classifies one character and
     * dispatches to the matching handler; each handler returns {@code true} to keep
     * reading and {@code false} once the tag is finished.
     */
    abstract class State {

        /**
         * Feeds one character (or {@code -1} for end of input) to this state.
         *
         * @param ch the value returned by {@code Reader.read()}
         * @return {@code false} on end of input or when the tag has been closed
         */
        public final boolean read(int ch) {
            if (ch == -1) {
                return false;
            }
            char c = (char) ch;
            switch (c) {
                case '>':
                    return readAngleBracket(c);
                case '=':
                    return readEqual(c);
                case ' ':
                case '\n':
                case '\t':
                case '\r':
                case '\f':
                    return readBreaker(c);
                case '"':
                case '\'':
                    return readSequence(c);
                default:
                    return readChar(c);
            }
        }

        /** Handles any character without a special meaning. */
        protected abstract boolean readChar(char ch);

        /** Handles a single or double quote. */
        protected abstract boolean readSequence(char ch);

        /** Handles a whitespace separator. */
        protected abstract boolean readBreaker(char ch);

        /** Handles {@code '='}. */
        protected abstract boolean readEqual(char ch);

        /** Handles {@code '>'}: by default stores the pending attribute and ends the tag. */
        protected boolean readAngleBracket(char ch) {
            addAttribute();
            return false;
        }
    }

    /** Inside an attribute name. */
    private class NameState extends State {
        @Override
        protected boolean readChar(char ch) {
            name.append(ch);
            return true;
        }

        @Override
        protected boolean readSequence(char ch) {
            name.append(ch);
            return true;
        }

        @Override
        protected boolean readBreaker(char ch) {
            enter(AttrStat.AFTER_NAME);
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            enter(AttrStat.NEW_VALUE);
            return true;
        }
    }

    /** Inside an unquoted attribute value; whitespace or {@code '>'} ends it. */
    private class ValueState extends State {
        @Override
        protected boolean readChar(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readSequence(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readBreaker(char ch) {
            addAttribute();
            enter(AttrStat.NEW_ATTR);
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            value.append(ch);
            return true;
        }
    }

    /** Inside a quoted attribute value; only the matching quote character ends it. */
    private class ValueQuotesState extends State {
        @Override
        protected boolean readChar(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readSequence(char ch) {
            if (quotes == ch) {
                addAttribute();
                enter(AttrStat.NEW_ATTR);
            } else {
                // The other kind of quote is ordinary content.
                value.append(ch);
            }
            return true;
        }

        @Override
        protected boolean readBreaker(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            value.append(ch);
            return true;
        }

        @Override
        protected boolean readAngleBracket(char ch) {
            // '>' inside quotes is part of the value and does not close the tag.
            value.append(ch);
            return true;
        }
    }

    /** After a name and at least one whitespace: either {@code '='} or a new name follows. */
    private class AfterNameState extends State {
        @Override
        protected boolean readChar(char ch) {
            return startNextName(ch);
        }

        @Override
        protected boolean readSequence(char ch) {
            return startNextName(ch);
        }

        @Override
        protected boolean readBreaker(char ch) {
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            enter(AttrStat.NEW_VALUE);
            return true;
        }

        /**
         * The previous attribute had no value: store it and begin the next name with
         * {@code ch}. The machine moves to NAME (not NEW_ATTR) because a name is already in
         * progress; otherwise a following {@code '='} would be appended to the name.
         */
        private boolean startNextName(char ch) {
            addAttribute();
            name.append(ch);
            enter(AttrStat.NAME);
            return true;
        }
    }

    /** Between attributes, waiting for the first character of a name. */
    private class NewAttrState extends State {
        @Override
        protected boolean readChar(char ch) {
            return startName(ch);
        }

        @Override
        protected boolean readSequence(char ch) {
            return startName(ch);
        }

        @Override
        protected boolean readBreaker(char ch) {
            addAttribute();
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            // A leading '=' is kept as the first character of the name.
            return startName(ch);
        }

        private boolean startName(char ch) {
            name.append(ch);
            enter(AttrStat.NAME);
            return true;
        }
    }

    /** After {@code '='}, waiting for the first character of a value. */
    private class NewValueState extends State {
        @Override
        protected boolean readChar(char ch) {
            return startUnquoted(ch);
        }

        @Override
        protected boolean readSequence(char ch) {
            // Remember which quote opened the value so only the same one closes it.
            quotes = ch;
            enter(AttrStat.VALUE_QUOTES);
            return true;
        }

        @Override
        protected boolean readBreaker(char ch) {
            return true;
        }

        @Override
        protected boolean readEqual(char ch) {
            return startUnquoted(ch);
        }

        private boolean startUnquoted(char ch) {
            value.append(ch);
            enter(AttrStat.VALUE);
            return true;
        }
    }
}