An idea:
// Review sketch: open fileUrl either from the classpath (when a local
// "<name>.gz" file exists) or over the network, and wrap it as a buffered
// UTF-8 reader over gzip content.
private BufferedReader getReader(final String fileUrl) throws IOException {
    final String filename = getFilename(fileUrl) + ".gz";
    final InputStream stream;
    if (new File(filename).isFile()) {
        // NOTE(review): the existence check uses the local file name but the
        // lookup uses fileUrl — confirm both refer to the same resource.
        stream = ClassLoader.getSystemResourceAsStream(fileUrl);
        if (stream == null) {
            // getSystemResourceAsStream() returns null when the resource is
            // missing; fail loudly instead of NPE-ing inside GZIPInputStream.
            throw new IOException("Classpath resource not found: " + fileUrl);
        }
        System.out.println("Using tdat header from classes directory");
    } else {
        // Only build the URL when it is actually needed, so a malformed URL
        // cannot break the classpath branch.
        stream = new URL(fileUrl).openStream();
    }
    final GZIPInputStream gzipStream = new GZIPInputStream(stream);
    // Explicit charset: the platform default varies from system to system.
    final InputStreamReader gzipStreamReader =
            new InputStreamReader(gzipStream, "UTF-8");
    return new BufferedReader(gzipStreamReader);
}
You don't have to close the GZIPInputStream explicitly — closing the outer Reader (Reader.close()) closes the wrapped streams as well.
I'd invert the condition inside the while
loop:
// start processing
// NOTE(review): ready() only reports that a read would not block; it is not
// a reliable end-of-stream test — TODO confirm, prefer readLine() != null.
while (reader.ready()) {
final String line = reader.readLine();
// Guard clause: skip lines that are not pipe-delimited records, keeping
// the happy path un-nested.
if (!line.matches("^(.*?\\|)*$")) {
continue;
}
// Fresh map per record; filled in by the omitted code below.
Map<String, String> result = new HashMap<String, String>();
...
}
It flattens the code, which makes it easier to read.
This code may be unnecessary, since no one uses the template object:
// create a template so I only have to create a map once
// NOTE(review): this template map is never read again in the code shown —
// it is probably dead code and can be removed. LinkedHashMap preserves the
// catalog's field order; every field starts out mapped to null.
final Map<String, String> template =
new LinkedHashMap<String, String>(catalog.getFieldData().size());
for (final String fieldName : catalog.getFieldData().keySet()) {
template.put(fieldName, null);
}
You should specify the charset when you call the constructor of the InputStreamReader.
// Passing the charset explicitly avoids relying on the platform default.
final InputStreamReader gzipStreamReader =
new InputStreamReader(gzipStream, "UTF-8");
Omitting it could lead to 'interesting' surprises, since it will use the default charset, which varies from system to system.
Here is the code after a few more method extractions. Check the comments please and feel free to ask if something isn't clear.
/**
 * Imports the tdat file found at the given URL.
 *
 * Any I/O failure is translated into the application-level MyAppException
 * (MalformedURLException is an IOException subclass, so it is covered too).
 */
public void importTdatFile(final String fileUrl) throws MyAppException {
    try {
        doImport(fileUrl);
    } catch (final IOException cause) {
        throw new MyAppException("Cannot import", cause);
    }
}
/**
 * Streams the tdat file line by line and writes the converted JSON output.
 *
 * Fix: the original loop condition used reader.ready(), which only reports
 * whether the next read would block — it is NOT an end-of-stream test and
 * can terminate the loop before the input is exhausted. Reading until
 * readLine() returns null is the reliable EOF condition.
 */
private void doImport(final String fileUrl) throws IOException {
    BufferedReader reader = null;
    BufferedWriter writer = null;
    try {
        reader = getReader(fileUrl);
        writer = getWriter();
        String line;
        while ((line = reader.readLine()) != null) {
            final Map<String, String> results = processLine(line);
            filterResults(results);
            final String jsonLine = getJsonLine(results);
            // NOTE(review): assumes getJsonLine() includes a line terminator —
            // TODO confirm, otherwise all records end up on a single line.
            writer.write(jsonLine);
        }
    } finally {
        closeQuetly(reader);
        closeQuetly(writer);
    }
}
/** Builds a buffered UTF-8 gzip reader over the stream resolved from fileUrl. */
private BufferedReader getReader(final String fileUrl) throws IOException {
    return createGzipReader(getStream(fileUrl));
}
/**
 * Resolves the input stream for the given URL: loaded from the classpath
 * when a matching local ".gz" file exists, otherwise opened over the network.
 *
 * Fix: MalformedURLException is a subclass of IOException, so declaring
 * both in the throws clause was redundant.
 */
private InputStream getStream(final String fileUrl) throws IOException {
    final InputStream stream;
    // TODO(review): refactor this 'if' — a local-file existence check that
    // drives a classpath lookup is suspicious; confirm the two really refer
    // to the same resource.
    if (isGzipFile(fileUrl)) {
        stream = ClassLoader.getSystemResourceAsStream(fileUrl);
        // TODO(review): replace this println with a proper logger call.
        System.out.println("Using tdat header from classes directory");
    } else {
        final URL url = new URL(fileUrl);
        stream = url.openStream();
    }
    return stream;
}
/**
 * Wraps the raw stream as a buffered UTF-8 reader over gzip content.
 *
 * Fix: UnsupportedEncodingException is a subclass of IOException, so the
 * throws clause only needs IOException — and "UTF-8" is a charset every
 * JVM is required to support, so it is never actually thrown here.
 */
private BufferedReader createGzipReader(final InputStream stream)
        throws IOException {
    final GZIPInputStream gzipStream = new GZIPInputStream(stream);
    // Explicit charset — the platform default varies between systems.
    final InputStreamReader gzipStreamReader =
            new InputStreamReader(gzipStream, "UTF-8");
    return new BufferedReader(gzipStreamReader);
}
/**
 * Reports whether a local ".gz" file exists for the given URL.
 *
 * NOTE(review): despite the name, this only checks for the file's existence
 * on disk; it does not inspect gzip magic bytes — confirm the intended
 * semantics with the caller.
 */
private boolean isGzipFile(final String fileUrl) {
    return new File(getFilename(fileUrl) + ".gz").isFile();
}
/**
 * Opens a buffered UTF-8 writer for the JSON output file.
 *
 * Fix (resolves the TODO): FileWriter always uses the platform-default
 * encoding; writing through an OutputStreamWriter over a FileOutputStream
 * with an explicit charset keeps the output stable across systems.
 * (Types are fully qualified so the file's import section is untouched.)
 */
private BufferedWriter getWriter() throws IOException {
    final String outputFilename = getOutputFilename();
    final java.io.OutputStreamWriter fileWriter = new java.io.OutputStreamWriter(
            new java.io.FileOutputStream(outputFilename), "UTF-8");
    return new BufferedWriter(fileWriter);
}
/** Derives the output file name from the catalog name ("<name>.json"). */
private String getOutputFilename() {
    return String.format("%s.json", catalog.getName());
}
/**
 * Splits a pipe-delimited tdat line into a field-name -> field-value map,
 * keeping only fields contained in the catalog's field-data set.
 * Non-processable lines yield an empty map.
 */
private Map<String, String> processLine(final String line) {
final Map<String, String> result = new HashMap<String, String>();
if (!isProcessableLine(line)) {
return result;
}
// It's hard to refactor without the internals of catalog.
final String[] fieldValues = line.split("\\|");
for (int i = 0; i < fieldValues.length; i++) {
// TODO: possible ArrayIndexOutOfBoundsException here?
// (presumably getFieldName(i) fails when the line has more values than
// the catalog has fields — verify against the catalog implementation)
final String fieldName = catalog.getFieldName(i);
final FieldData fieldData = catalog.getFieldData(fieldName);
if (catalog.fieldDataSetContains(fieldData)) {
final String fieldValue = fieldValues[i];
result.put(fieldName, fieldValue);
}
}
return result;
}
/**
 * Post-processes one parsed record in place: drops null values, removes
 * unwanted fields, then normalizes field prefixes and names.
 */
private void filterResults(final Map<String, String> results) {
removeNulls(results);
// TODO: catalog probably a field, so it should be visible inside these
// methods without passing them as a parameter
removeUnwantedFields(results, catalog);
fixFieldPrefixes(results, catalog);
fixFieldNames(results, catalog);
}
/**
 * A line is processable when it consists solely of '|'-terminated segments
 * (the empty string matches as well).
 */
private boolean isProcessableLine(final String line) {
    // TODO: a precompiled regexp may be faster, but that would be premature
    // optimization — don't do it without profiling first.
    // (String.matches delegates to exactly this call.)
    return java.util.regex.Pattern.matches("^(.*?\\|)*$", line);
}
/**
 * Closes the given resource, reporting (rather than propagating) any
 * IOException. A null argument is silently ignored.
 *
 * NOTE(review): the name has a typo ("Quetly" -> "Quietly") but is kept
 * unchanged so existing callers keep compiling.
 */
private void closeQuetly(final Closeable closeable) {
    try {
        if (closeable != null) {
            closeable.close();
        }
    } catch (final IOException e) {
        // TODO: log the exception with a logger instead of printStackTrace().
        e.printStackTrace();
    }
}
Anyway, your streaming approach is fine, you shouldn't read the whole file into the memory.