First, parse your XML file to produce one or more flat files that contain all of the information you need.
For example, if you want a single table with the attributes clsid, entryid, semType, baseForm, variant(writtenform), variant(type), dc(att), dc(val), then you need just one file containing these attributes, separated by some delimiter character. Each line in the file corresponds to one row in the table.
Next, create the table schema in PostgreSQL. Then use PostgreSQL's COPY command, which loads all of the data from the file into the table.
Note that if your XML file is huge, you should use an event-based (streaming) parser, such as SAX or StAX in Java.
EDIT
*NOTE*: libraries used: stax2-api-3.1.1.jar, woodstox-core-asl-4.1.1.jar
Here is the code (hopefully it does what you need; if not, I am sure it will get you started):
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package test;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.codehaus.stax2.XMLInputFactory2;
import org.codehaus.stax2.XMLStreamReader2;
/**
 * Flattens a cluster-oriented XML file into a tab-delimited text file that can be
 * loaded with PostgreSQL's {@code COPY ... FROM} (text format).
 *
 * <p>One output line is written per {@code <Cluster>} element, in this column order:
 * {@code clsId, entryId, semType, baseForm}, then a {@code (WRITTENFORM, type)} pair for
 * every {@code <Variant>} and an {@code (att, val)} pair for every {@code <DC>} found
 * inside the cluster. The number of columns therefore varies with the number of
 * variants and DCs in each cluster. If a cluster contains several {@code <Entry>}
 * elements, the {@code entryId}/{@code baseForm} of the last one is used.
 */
public class Main {

    /** Output file used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT_PATH = "/home/aseke/Desktop/out.txt";

    /** Column separator: a real tab character, the default delimiter of COPY's text format. */
    private static final char DELIMITER = '\t';

    /** Written for a missing attribute; this is the default NULL string of COPY's text format. */
    private static final String NULL_MARKER = "\\N";

    /**
     * Converts the XML file named by {@code args[0]} into a tab-delimited file.
     *
     * @param args {@code args[0]} is the input XML path (read as UTF-8); the optional
     *             {@code args[1]} is the output path, defaulting to
     *             {@value #DEFAULT_OUTPUT_PATH}
     * @throws IllegalArgumentException if no input path is supplied
     * @throws IOException              if the input cannot be read or the output cannot be written
     * @throws XMLStreamException       if the input is not well-formed XML
     */
    public static void main(String[] args) throws MalformedURLException, IOException, XMLStreamException {
        if (args.length < 1) {
            throw new IllegalArgumentException("usage: Main <input.xml> [<output.txt>]");
        }
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

        // try-with-resources closes every stream even when parsing fails half-way.
        try (FileInputStream fstream = new FileInputStream(args[0]);
                Reader in = new InputStreamReader(fstream, StandardCharsets.UTF_8);
                FileOutputStream outStream = new FileOutputStream(outputPath);
                BufferedWriter out =
                        new BufferedWriter(new OutputStreamWriter(outStream, StandardCharsets.UTF_8))) {
            // Only the standard StAX interfaces are needed, so no cast to an
            // implementation-specific type: whichever StAX provider is on the
            // classpath is used.
            XMLStreamReader parser = XMLInputFactory.newInstance().createXMLStreamReader(in);
            try {
                convert(parser, out);
            } finally {
                // XMLStreamReader is not AutoCloseable, so it is released by hand.
                parser.close();
            }
        }
    }

    /** Streams through the document and writes one row for every closed Cluster element. */
    private static void convert(XMLStreamReader parser, BufferedWriter out)
            throws IOException, XMLStreamException {
        boolean inCluster = false;
        String clsId = null;
        String semType = null;
        String entryId = null;
        String baseForm = null;
        // Flat lists of already-paired fields: [written, type, written, type, ...].
        // Keeping the values separate avoids packing them into one string and
        // splitting it again, which breaks when a value contains the separator.
        List<String> variantFields = new ArrayList<>();
        List<String> dcFields = new ArrayList<>();

        while (parser.hasNext()) {
            int event = parser.next();

            if (event == XMLStreamConstants.START_ELEMENT) {
                String tag = parser.getLocalName();
                if ("Cluster".equals(tag)) {
                    inCluster = true;
                    // Attributes are looked up by name, so their order in the
                    // document does not matter.
                    clsId = parser.getAttributeValue(null, "clsId");
                    semType = parser.getAttributeValue(null, "semType");
                    // Start from a clean slate so nothing leaks in from the previous cluster.
                    entryId = null;
                    baseForm = null;
                    variantFields.clear();
                    dcFields.clear();
                } else if (!inCluster) {
                    // Elements outside of a Cluster are not part of any row.
                    continue;
                } else if ("Entry".equals(tag)) {
                    entryId = parser.getAttributeValue(null, "entryId");
                    baseForm = parser.getAttributeValue(null, "baseForm");
                } else if ("Variant".equals(tag)) {
                    variantFields.add(parser.getAttributeValue(null, "WRITTENFORM"));
                    variantFields.add(parser.getAttributeValue(null, "type"));
                } else if ("DC".equals(tag)) {
                    dcFields.add(parser.getAttributeValue(null, "att"));
                    dcFields.add(parser.getAttributeValue(null, "val"));
                }
            } else if (event == XMLStreamConstants.END_ELEMENT
                    && inCluster
                    && "Cluster".equals(parser.getLocalName())) {
                inCluster = false;

                List<String> row = new ArrayList<>(4 + variantFields.size() + dcFields.size());
                row.add(clsId);
                row.add(entryId);
                row.add(semType);
                row.add(baseForm);
                row.addAll(variantFields);
                row.addAll(dcFields);
                writeRow(out, row);
            }
        }
    }

    /** Writes the fields separated by {@link #DELIMITER} and terminated by a newline. */
    private static void writeRow(BufferedWriter out, List<String> fields) throws IOException {
        for (int i = 0; i < fields.size(); i++) {
            if (i > 0) {
                out.write(DELIMITER);
            }
            out.write(escape(fields.get(i)));
        }
        // A fixed '\n' keeps the file identical across platforms; each row must
        // end its line so that COPY sees one record per line.
        out.write('\n');
    }

    /**
     * Encodes one value for COPY's text format: {@code null} becomes {@code \N}, and
     * backslash, tab, newline and carriage return are backslash-escaped so they cannot
     * be mistaken for column or row separators.
     */
    private static String escape(String value) {
        if (value == null) {
            return NULL_MARKER;
        }
        StringBuilder escaped = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '\\':
                    escaped.append("\\\\");
                    break;
                case '\t':
                    escaped.append("\\t");
                    break;
                case '\n':
                    escaped.append("\\n");
                    break;
                case '\r':
                    escaped.append("\\r");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }
}
I formatted your input XML, so the input file looks like this:
<Cluster clsId="UNIPR_NIRI_PARDP" semType="geneProt">
<Entry entryId="UNIPR_NIRI_PARDP_1" baseForm="Protein nirI" type="PREFERRED">
<Variant WRITTENFORM="FMN-binding domain protein" type="orthographic"/>
<Variant WRITTENFORM="FMN-binding domain-containing protein" type="orthographic"/>
<Variant WRITTENFORM="unknown" type="orthographic"/>
<Variant WRITTENFORM="FMN-binding" type="orthographic"/>
<Variant WRITTENFORM="Pden_2486" type="orthographic"/>
<Variant WRITTENFORM="nirI" type="orthographic"/>
<SourceDC sourceName="BioThesaurus" sourceId="Q51699"/>
<PosDC posName="POS" pos="N"/>
<DC att="uniprot_ac" val="Q51699"/>
<DC att="speciesNameNCBI" val="318586"/>
</Entry>
</Cluster>
The output looks like this. Note that it is delimited with tabs, which will later be used as the delimiter in PostgreSQL's COPY command. You can change the delimiter to any other character.
UNIPR_NIRI_PARDP/tUNIPR_NIRI_PARDP_1/tgeneProt/tProtein nirI/tFMN-binding domain protein/torthographic/tFMN-binding domain-containing protein/torthographic/tunknown/torthographic/tFMN-binding/torthographic/tPden_2486/torthographic/tnirI/torthographic/tuniprot_ac/tQ51699/tspeciesNameNCBI/t318586