diff --git a/README.md b/README.md
index a438fb0fe..0a8dbd0da 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,7 @@ Each library contains detailed readme and instructions on how to use it. In addi
 | [commasrl](commasrl/README.md) | This software extracts relations that commas participate in. |
 | [similarity](similarity/README.md) | This software compare objects --especially Strings-- and return a score indicating how similar they are. |
 | [temporal-normalizer](temporal-normalizer/README.md) | A temporal extractor and normalizer. |
+| [dataless-classifier](dataless-classifier/README.md) | Classifies text into a user-specified label hierarchy from just the textual label descriptions. |
 | [external-annotators](external/README.md) | A collection useful external annotators. |
diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/ViewNames.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/ViewNames.java
index 102e16796..fd788635a 100644
--- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/ViewNames.java
+++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/ViewNames.java
@@ -88,6 +88,9 @@ public class ViewNames {
     public static final String WIKIFIER = "WIKIFIER";
 
+    public static final String DATALESS_ESA = "DATALESS_ESA";
+    public static final String DATALESS_W2V = "DATALESS_W2V";
+
     /**
      * @deprecated Replaced by {@link #CLAUSES_CHARNIAK}, {@link #CLAUSES_BERKELEY},
      *             {@link #CLAUSES_STANFORD}
      */
@@ -150,6 +153,8 @@ public static ViewTypes getViewType(String viewName) {
             case SHALLOW_PARSE:
             case QUANTITIES:
             case WIKIFIER:
+            case DATALESS_ESA:
+            case DATALESS_W2V:
             case CLAUSES_CHARNIAK:
             case CLAUSES_STANFORD:
             case CLAUSES_BERKELEY:
diff --git a/dataless-classifier/README.md b/dataless-classifier/README.md
new file mode 100644
index 000000000..6b5478172
--- /dev/null
+++ b/dataless-classifier/README.md
@@ -0,0 +1,45 @@
+# CogComp-DatalessClassifier
+Given a label ontology and textual descriptions of those labels, Dataless-Classifier can classify arbitrary text into that ontology.
+
+It is particularly useful in scenarios where it is difficult or expensive to gather enough training data to train a supervised text classifier. Dataless-Classifier utilizes the semantic meaning of the labels to bypass the need for explicit supervision. For more information, please visit our main project [page](http://cogcomp.org/page/project_view/6).
+
+
+Some key points:
+- The main classes for the Dataless Annotators are:
+  * **ESADatalessAnnotator** for the ESA-based Dataless Annotator
+  * **W2VDatalessAnnotator** for the Word2Vec-based Dataless Annotator
+- The Dataless Annotators add the **DATALESS_ESA** and **DATALESS_W2V** views, respectively, to the input `TextAnnotation`; they require the presence of a **TOKENS** view with the end-user's desired tokenization.
+- Since labels/topics are inferred at the document level, all topic annotations span the entire document.
+- A sample invocation is provided in the main method of each annotator.
+- Both annotators load their embeddings into memory, and thus can easily consume up to **10GB of RAM**.
+
+
+## Label Hierarchy
+Dataless Classification requires the end-user to specify a label hierarchy (with label descriptions) to classify into.
+The Label hierarchy needs to be provided using a very specific format:
+* **labelNamePath**: Specify your label id to label name mapping here in the `labelID \t labelName` format
+  (the label id can be any ID specific to your system; however, we use the label name itself as the ID in our sample hierarchy for readability)
+* **labelHierarchyPath**: The first line of this file should contain a tab-separated list of the top-level nodes in the hierarchy (i.e. the ones directly connected to the root). Then, every following line should specify the connections in the hierarchy in the `parentLabelID \t childLabelID1 \t childLabelID2 \t ...` format.
+* **labelDescPath**: Dataless classification's performance hinges on good label descriptions, which you specify in this file in the `labelID \t labelDescription` format.
+
+We provide a sample 20newsgroups hierarchy with label descriptions inside `data/hierarchies/20newsgroups`, where:
+* idToLabelNameMap.txt should be used as labelNamePath
+* parentChildIdMap.txt should be used as labelHierarchyPath
+* labelDesc\_Kws\_simple.txt should be used as labelDescPath
+
+We also provide improved 20newsgroups label descriptions in *labelDesc\_Kws\_embellished.txt*, which correspond to the label descriptions used in [2], whereas *labelDesc\_Kws\_simple.txt* corresponds to the label descriptions used in [1].
+
+## Embeddings
+ESA and Word2Vec embeddings are fetched from the DataStore on demand.
+
+## Config
+A sample config file with the default values is provided in the config folder: *config/project.properties*.
+
+To check whether you are set up to use the project properly, run:
+* `mvn -Dtest=ESADatalessTest#testPredictions test` to test the ESADatalessAnnotator.
+* `mvn -Dtest=W2VDatalessTest#testPredictions test` to test the W2VDatalessAnnotator.
+
+If you use this software for research, please cite the following papers:
+
+[1] Chang, Ming-Wei, et al. "Importance of Semantic Representation: Dataless Classification." AAAI. Vol. 2. 2008.
+
+[2] Song, Yangqiu, and Dan Roth. "On Dataless Hierarchical Text Classification." AAAI. Vol. 7. 2014.
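To make the intended wiring concrete, here is a minimal sketch of how the annotators described in the README above could be invoked; it is not code from this PR. It assumes `W2VDatalessAnnotator` accepts a `ResourceManager` over `config/project.properties` and follows the standard `Annotator` contract (`getView(ta)` populating the view), and that `TokenizerTextAnnotationBuilder`/`StatefulTokenizer` from `illinois-tokenizer` supply the required **TOKENS** view; the `main` method of each annotator remains the authoritative example.

```java
import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder;
import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager;
import edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator;
import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;
import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder;

public class DatalessDemo {
    public static void main(String[] args) throws Exception {
        // 1. Build a TextAnnotation that already carries the required TOKENS view.
        TextAnnotationBuilder taBuilder =
                new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        String text = "the wings won again last night thanks to an empty net goal"; // hypothetical document
        TextAnnotation ta = taBuilder.createTextAnnotation("corpus", "doc1", text);

        // 2. Point the annotator at the label hierarchy and classifier settings.
        ResourceManager rm = new ResourceManager("config/project.properties");

        // 3. Run the annotator; this is expected to add the DATALESS_W2V view
        //    (use ESADatalessAnnotator / ViewNames.DATALESS_ESA for the ESA variant).
        W2VDatalessAnnotator annotator = new W2VDatalessAnnotator(rm);
        annotator.getView(ta);

        // 4. Topics are inferred at the document level, so each predicted label is a
        //    constituent spanning the whole document.
        for (Constituent label : ta.getView(ViewNames.DATALESS_W2V).getConstituents())
            System.out.println(label.getLabel());
    }
}
```

As the README notes, both annotators hold their embeddings in memory, so run anything like the above with a large heap (for example `-Xmx10g`, matching the scripts under `script/`).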
diff --git a/dataless-classifier/config/project.properties b/dataless-classifier/config/project.properties new file mode 100644 index 000000000..709029944 --- /dev/null +++ b/dataless-classifier/config/project.properties @@ -0,0 +1,25 @@ +## Use ResourceManager to read these properties +# curatorHost = trollope.cs.illinois.edu +# curatorPort = 9010 + +## Target Label Hierarchy +labelHierarchyPath = data/hierarchies/20newsgroups/parentChildIdMap.txt +labelNamePath = data/hierarchies/20newsgroups/idToLabelNameMap.txt +labelDescPath = data/hierarchies/20newsgroups/labelDesc_Kws_simple.txt +# labelDescPath = data/hierarchies/20newsgroups/labelDesc_Kws_embellished.txt + +## Classifier configuration +inferenceBottomUp = True +classifierThreshold = 0.99 +classifierLeastK = 1 +classifierMaxK = 3 + +## ESA Configuration +#esaPath = data/embeddings/esaEmbedding/esa_vectors.txt +#esaMapPath = data/embeddings/esaEmbedding/idToConceptMap.txt +#esaDimension = 100 + +## W2V Configuration +#w2vPath = data/embeddings/w2vEmbedding-100/w2v_vectors.txt +#w2vDimension = 200 + diff --git a/dataless-classifier/data/electronicsTestDocument.txt b/dataless-classifier/data/electronicsTestDocument.txt new file mode 100644 index 000000000..799d39f7a --- /dev/null +++ b/dataless-classifier/data/electronicsTestDocument.txt @@ -0,0 +1 @@ +yes i know it s nowhere near christmas time but i m gonna loose net access in a few days maybe a week or if i m lucky and wanted to post this for interested people to save till xmas note bell labs is a good place if you have a phd and a good boss i have neither subject xmas light set with levels of brightness another version of a variable brightness xmas light set this set starts with a blinker bulb string diagram orginal way set 0v b b 0rtn modified set for level brightness string 0v 0k w string b 0v rtn note no mods to wiring to the right of this point only one blinker is used note that the blinker would not have as much current thru it as the string bulbs because of the second string of bulbs in parallel with it that s why the use of the 0k w resistor here to add extra current thru the blinker to make up for the current shunted thru the second string while the blinker is glowing and the second string is not glowing when the blinker goes open this resistor has only a slight effect on the brightness of the strings s slightly dimmer s slightly brighter or use a w 0v bulb in place of the 0k resistor if you can get one caution do not replace with a standard c bulb as these draw too much current and burn out the blinker c approx w what you ll see when it s working powerup string will light at full brightness and b will be lit bypassing most of the current from the second string making them not light b will open placing both strings in series making the string that was out to glow at a low brightness and the other string that was on before to glow at reduced brightness be sure to wire and insulate the splices resistor leads and cut wires in a safe manner level brightness xmas light set for easter diff --git a/dataless-classifier/data/graphicsTestDocument.txt b/dataless-classifier/data/graphicsTestDocument.txt new file mode 100644 index 000000000..f7e5d430f --- /dev/null +++ b/dataless-classifier/data/graphicsTestDocument.txt @@ -0,0 +1 @@ +i m looking for some recommendations for screen capture programs a couple of issues ago pc mag listed as editor s choices both conversion artist and hijaak for windows anyone have any experience with those or some others i m trying to get an alpha manual 
in the next few days and i m not making much progress with the screen shots i m currently using dodot and i m about to burn it and the disks it rode it on it s got a lot of freaky bugs and oversights that are driving me crazy tonight it decided that for any graphic it writes out as a tiff file that s under a certain arbitrary size it will swap the left and right sides of the picture usually it confines itself to not copying things to the clipboard so i have to save and load pix for editing in paintbrush or crashing every hour or so the one nice thing it has though is it s dither option you d think that this would turn colors into dots which it does if you go from say colors to colors but if you go from or colors to b w you can set a threshold level for which colors turn to black and which turn to white for me this is useful because i can turn light grays on buttons to white and the dark grays to black and thereby preserve the d effect on buttons and other parts of the window if you understood my description can you tell me if another less buggy program can do this as well much thanks for any help signature david delgreco what lies behind us and what lies technically a writer before us are tiny matters compared delgreco rahul net to what lies within us oliver wendell holmes david f delgreco delgreco rahul net recommendation for screen capture program diff --git a/dataless-classifier/data/hierarchies/20newsgroups/idToLabelNameMap.txt b/dataless-classifier/data/hierarchies/20newsgroups/idToLabelNameMap.txt new file mode 100644 index 000000000..48034a682 --- /dev/null +++ b/dataless-classifier/data/hierarchies/20newsgroups/idToLabelNameMap.txt @@ -0,0 +1,26 @@ +politics politics +religion religion +computer computer +autos.sports autos.sports +science science +sales sales +talk.politics.guns talk.politics.guns +talk.politics.mideast talk.politics.mideast +talk.politics.misc talk.politics.misc +alt.atheism alt.atheism +soc.religion.christian soc.religion.christian +talk.religion.misc talk.religion.misc +comp.sys.ibm.pc.hardware comp.sys.ibm.pc.hardware +comp.sys.mac.hardware comp.sys.mac.hardware +comp.graphics comp.graphics +comp.windows.x comp.windows.x +comp.os.ms.windows.misc comp.os.ms.windows.misc +rec.autos rec.autos +rec.motorcycles rec.motorcycles +rec.sport.baseball rec.sport.baseball +rec.sport.hockey rec.sport.hockey +sci.electronics sci.electronics +sci.crypt sci.crypt +sci.med sci.med +sci.space sci.space +misc.forsale misc.forsale diff --git a/dataless-classifier/data/hierarchies/20newsgroups/labelDesc_Kws_embellished.txt b/dataless-classifier/data/hierarchies/20newsgroups/labelDesc_Kws_embellished.txt new file mode 100644 index 000000000..c2b10da6f --- /dev/null +++ b/dataless-classifier/data/hierarchies/20newsgroups/labelDesc_Kws_embellished.txt @@ -0,0 +1,26 @@ +politics politics gun fbi guns weapon compound israel arab jews jewish muslim gay homosexual sexual +religion religion atheist christian atheism god islamic christian god christ church bible jesus christian morality jesus god religion horus +computer computer bus pc motherboard bios board computer dos mac apple powerbook graphics image gif animation tiff window motif xterm sun windows windows dos microsoft ms driver drivers card printer +autos.sports autos.sports car ford auto toyota honda nissan bmw bike motorcycle yamaha baseball ball hitter hockey wings espn +science science circuit electronics radio signal battery encryption key crypto algorithm security doctor medical disease medicine patient space orbit moon 
earth sky solar +sales sales sale offer shipping forsale sell price brand obo +talk.politics.guns gun fbi guns weapon compound +talk.politics.mideast israel arab jews jewish muslim +talk.politics.misc gay homosexual sexual +alt.atheism atheist christian atheism god islamic +soc.religion.christian christian god christ church bible jesus +talk.religion.misc christian morality jesus god religion horus +comp.sys.ibm.pc.hardware bus pc motherboard bios board computer dos +comp.sys.mac.hardware mac apple powerbook +comp.graphics graphics image gif animation tiff +comp.windows.x window motif xterm sun windows +comp.os.ms.windows.misc windows dos microsoft ms driver drivers card printer +rec.autos car ford auto toyota honda nissan bmw +rec.motorcycles bike motorcycle yamaha +rec.sport.baseball baseball ball hitter +rec.sport.hockey hockey wings espn +sci.electronics circuit electronics radio signal battery +sci.crypt encryption key crypto algorithm security +sci.med doctor medical disease medicine patient +sci.space space orbit moon earth sky solar +misc.forsale sale offer shipping forsale sell price brand obo diff --git a/dataless-classifier/data/hierarchies/20newsgroups/labelDesc_Kws_simple.txt b/dataless-classifier/data/hierarchies/20newsgroups/labelDesc_Kws_simple.txt new file mode 100644 index 000000000..0c0a7ce7b --- /dev/null +++ b/dataless-classifier/data/hierarchies/20newsgroups/labelDesc_Kws_simple.txt @@ -0,0 +1,26 @@ +politics politics politics guns politics mideast politics +religion religion atheism society religion christianity christian religion +computer computer computer systems ibm pc hardware computer systems mac macintosh apple hardware computer graphics computer windows x windowsx computer os operating system microsoft windows +autos.sports autos.sports cars motorcycles baseball hockey +science science science electronics science cryptography medicine science space +sales sales for sale discount +talk.politics.guns politics guns +talk.politics.mideast politics mideast +talk.politics.misc politics +alt.atheism atheism +soc.religion.christian society religion christianity christian +talk.religion.misc religion +comp.sys.ibm.pc.hardware computer systems ibm pc hardware +comp.sys.mac.hardware computer systems mac macintosh apple hardware +comp.graphics computer graphics +comp.windows.x computer windows x windowsx +comp.os.ms.windows.misc computer os operating system microsoft windows +rec.autos cars +rec.motorcycles motorcycles +rec.sport.baseball baseball +rec.sport.hockey hockey +sci.electronics science electronics +sci.crypt science cryptography +sci.med science medicine +sci.space science space +misc.forsale for sale discount diff --git a/dataless-classifier/data/hierarchies/20newsgroups/parentChildIdMap.txt b/dataless-classifier/data/hierarchies/20newsgroups/parentChildIdMap.txt new file mode 100644 index 000000000..5c49b8bbd --- /dev/null +++ b/dataless-classifier/data/hierarchies/20newsgroups/parentChildIdMap.txt @@ -0,0 +1,7 @@ +politics religion computer autos.sports science sales +politics talk.politics.guns talk.politics.mideast talk.politics.misc +religion alt.atheism soc.religion.christian talk.religion.misc +computer comp.sys.ibm.pc.hardware comp.sys.mac.hardware comp.graphics comp.windows.x comp.os.ms.windows.misc +autos.sports rec.autos rec.motorcycles rec.sport.baseball rec.sport.hockey +science sci.electronics sci.crypt sci.med sci.space +sales misc.forsale \ No newline at end of file diff --git a/dataless-classifier/pom.xml b/dataless-classifier/pom.xml new 
file mode 100644 index 000000000..77d00b615 --- /dev/null +++ b/dataless-classifier/pom.xml @@ -0,0 +1,71 @@ + + + + illinois-cogcomp-nlp + edu.illinois.cs.cogcomp + 4.0.0 + + + 4.0.0 + + illinois-datalessclassification + Illinois Dataless Classifier + Classifies Text into the given label hierarchy from just the textual label descriptions + + + + org.cogcomp + cogcomp-datastore + 1.9.10 + + + edu.illinois.cs.cogcomp + illinois-core-utilities + 4.0.0 + + + edu.illinois.cs.cogcomp + illinois-tokenizer + 4.0.0 + + + org.slf4j + slf4j-log4j12 + 1.7.12 + true + + + net.sf.jung + jung-api + 2.0.1 + + + net.sf.jung + jung-graph-impl + 2.0.1 + + + commons-cli + commons-cli + 1.4 + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.20.1 + + + -Xmx15g + + + + + + + + diff --git a/dataless-classifier/script/testESADataless.sh b/dataless-classifier/script/testESADataless.sh new file mode 100644 index 000000000..f2f8520de --- /dev/null +++ b/dataless-classifier/script/testESADataless.sh @@ -0,0 +1,3 @@ +#mvn compile +#mvn dependency:copy-dependencies +nice java -Xmx10g -cp ./target/*:./target/dependency/* edu.illinois.cs.cogcomp.datalessclassification.ta.ESADatalessAnnotator $@ diff --git a/dataless-classifier/script/testW2VDataless.sh b/dataless-classifier/script/testW2VDataless.sh new file mode 100644 index 000000000..f7bda7662 --- /dev/null +++ b/dataless-classifier/script/testW2VDataless.sh @@ -0,0 +1,3 @@ +#mvn compile +#mvn dependency:copy-dependencies +nice java -Xmx10g -cp ./target/*:./target/dependency/* edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator $@ diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/AClassifierTree.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/AClassifierTree.java new file mode 100755 index 000000000..6c4db0a32 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/AClassifierTree.java @@ -0,0 +1,214 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.classifier; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import edu.illinois.cs.cogcomp.datalessclassification.hierarchy.SimpleTree; +import edu.illinois.cs.cogcomp.datalessclassification.hierarchy.TreeNode; + +/** + * An Abstract Tree class that has the same link structure as a {@link LabelTree}. + * + * Ideally, different classes will extend this class with different types of Nodes, wherein their corresponding Node + * will contain the additional payload (data) required by their corresponding classifier. + * + * For instance, a simple nearest-neighbor based Dataless Classifier requires a Tree wherein each node contains the label + * representation, which is achieved by extending this class ({@link ConceptTree}), and the corresponding Node class {@link ConceptTreeNode}). 
+ * + * @author shashank + */ + +public abstract class AClassifierTree extends SimpleTree { + + private static final long serialVersionUID = 1L; + + protected String root_label; + protected LabelTree labelTree; + + public AClassifierTree(LabelTree labelTree) { + super(); + setLabelTree(labelTree); + initializeTreeStructure(); + } + + protected boolean isLabelTreeInitialized() { + return (!(labelTree == null)); + } + + @SuppressWarnings("unchecked") + protected void initializeRoot(String root_label) { + T root = (T) T.makeBasicNode(root_label); + initializeRoot(root); + } + + public String getRootLabel() { + return root_label; + } + + protected void setLabelTree(LabelTree labelTree) { + if (isLabelTreeInitialized()) + return; + + this.root_label = labelTree.getRoot().getLabelID(); + initializeRoot(root_label); + this.labelTree = labelTree; + } + + public LabelTree getLabelTree() { + return labelTree; + } + + @SuppressWarnings("unchecked") + public Set getChildren(String label) { + Set set = getChildren((T) T.makeBasicNode(label)); + + if (set == null) + return null; + + if (set.isEmpty()) + return Collections.emptySet(); + + Set newSet = new HashSet(set.size()); + + newSet.addAll(set); + + return newSet; + } + + @SuppressWarnings("unchecked") + public T getParent(String label) { + T parent = getParent((T) T.makeBasicNode(label)); + return parent; + } + + @SuppressWarnings("unchecked") + public boolean addEdge(String parent, String child) { + T parentNode = (T) T.makeBasicNode(parent); + T childNode = (T) T.makeBasicNode(child); + + return addEdge(parentNode, childNode); + } + + public boolean addEdges(String parent, Set children) { + boolean success = true; + + for (String child : children) { + success = addEdge(parent, child); + + if (!success) + break; + } + + return success; + } + + public Set getLeafLabels() { + Set set = labelTree.getLeafLabels(); + return set; + } + + @SuppressWarnings("unchecked") + public boolean isLeaf(String label) { + T node = (T) T.makeBasicNode(label); + return isLeaf(node); + } + + @SuppressWarnings("unchecked") + public int getDepth(String label) { + return getDepth((T) T.makeBasicNode(label)); + } + + @SuppressWarnings("unchecked") + public List getAllParents(String label) { + List parentNodes = getAllParents((T) T.makeBasicNode(label)); + + if (parentNodes == null) + return null; + + List parents = new ArrayList<>(parentNodes.size()); + + parents.addAll(parentNodes); + + return parents; + } + + public List getAllParentLabels(String label) { + List parentNodes = getAllParents(label); + + if (parentNodes == null) + return null; + + List parents = new ArrayList<>(parentNodes.size()); + + for (T p : parentNodes) { + parents.add(p.getLabelID()); + } + + return parents; + } + + @SuppressWarnings("unchecked") + public T getNodeFromLabel(String label) { + T node = getNode((T) T.makeBasicNode(label)); + + return node; + } + + public Set getSameLevelNodes(String label) { + int depth = getDepth(label); + + if (depth == -1) + return null; + + List nodes = getBreadthOrderedNodeList(); + + Set output = new HashSet<>(); + + for (T node : nodes) { + int thisDepth = getDepth(node); + + if (thisDepth > depth) + break; + else if (thisDepth == depth) + output.add(node); + } + + return output; + } + + public Set getSameLevelLabels(String label) { + Set nodes = getSameLevelNodes(label); + + if (nodes == null) + return null; + + Set output = new HashSet<>(nodes.size()); + + for (T p : nodes) { + output.add(p.getLabelID()); + } + + return output; + } + + public void 
initializeTreeStructure() { + List nodes = labelTree.getBreadthOrderedLabelList(); + + for (String node : nodes) { + if (!labelTree.isLeaf(node)) { + Set children = labelTree.getChildren(node); + addEdges(node, children); + } + } + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/ConceptTree.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/ConceptTree.java new file mode 100755 index 000000000..8bb016aff --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/ConceptTree.java @@ -0,0 +1,362 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.classifier; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import edu.illinois.cs.cogcomp.datalessclassification.representation.AEmbedding; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVector; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVectorOperations; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A ConceptTree has the same link structure as a {@link LabelTree}, with the addition that + * it contains vector representations for each node (labelID). 
+ * + * @author yqsong@illinois.edu + * @author shashank + */ + +public class ConceptTree extends AClassifierTree> { + + private static Logger logger = LoggerFactory.getLogger(ConceptTree.class); + + private static final long serialVersionUID = 1L; + + private final static String NIL = "NIL"; + + private transient Map globalConceptWeights; + + private transient AEmbedding embedding; + + protected transient int numConcepts; + + public ConceptTree(LabelTree labelTree) { + super(labelTree); + } + + public ConceptTree(LabelTree labelTree, AEmbedding embedding, Map conceptWeights) { + this(labelTree, embedding, conceptWeights, 500); + } + + public ConceptTree(LabelTree labelTree, AEmbedding embedding, Map conceptWeights, + int embeddingSize) { + super(labelTree); + this.embedding = embedding; + this.globalConceptWeights = conceptWeights; + this.numConcepts = embeddingSize; + + initializeTree(); + } + + /** + * Generates and Returns a ConceptTree using the provided LabelTree, and the + * LabelID -> Embeddings Map + */ + public static ConceptTree generateDenseEmbeddedTreeFromLabelEmbeddingMap( + LabelTree labelTree, Map> labelEmbeddings) { + ConceptTree conceptTree = new ConceptTree<>(labelTree); + + for (ConceptTreeNode node : conceptTree.getNodes()) { + String labelID = node.getLabelID(); + node.setLabelDescription(labelTree.getLabelDescription(labelID)); + + SparseVector conceptVector = labelEmbeddings.get(labelID); + node.setConceptVector(conceptVector); + } + + return conceptTree; + } + + /** + * Generates and Returns a ConceptTree using the provided LabelTree, and the + * File containing the String representation of the LabelID -> Embeddings Map + */ + public static ConceptTree generateDenseEmbeddedTreeFromFile( + LabelTree labelTree, String repFile) { + logger.info("Reading Label Embeddings from " + repFile); + File inputFile = new File(repFile); + + Map> labelEmbeddings = new HashMap<>(); + + try(BufferedReader bf = new BufferedReader(new FileReader(inputFile))) { + String line; + + while ((line = bf.readLine()) != null) { + line = line.trim(); + + if (line.length() == 0) + continue; + + String[] tokens = line.trim().split("\t", 2); + String[] stringVec = tokens[1].split(" "); + + String label = tokens[0].trim(); + + if (label.length() == 0) + continue; + + Map scores = new HashMap<>(); + + int i = 0; + + for (String dim : stringVec) { + scores.put(i, Double.parseDouble(dim)); + i++; + } + + SparseVector vec = new SparseVector<>(scores); + + labelEmbeddings.put(label, vec); + } + + } catch (FileNotFoundException e) { + e.printStackTrace(); + logger.error("File not found at " + repFile); + throw new RuntimeException("File not found at " + repFile); + } catch (IOException e) { + e.printStackTrace(); + logger.error("Error while reading file"); + throw new RuntimeException("Error while reading file"); + } + + ConceptTree conceptTree = + generateDenseEmbeddedTreeFromLabelEmbeddingMap(labelTree, labelEmbeddings); + return conceptTree; + } + + /** + * Copy Constructor + */ + public ConceptTree(ConceptTree thatTree) { + super(new LabelTree(thatTree.getLabelTree())); + + for (ConceptTreeNode node : getNodes()) { + if (isRoot(node)) + continue; + + String labelID = node.getLabelID(); + ConceptTreeNode thatNode = thatTree.getNodeFromLabel(labelID); + + String description = thatNode.getLabelDescription(); + node.setLabelDescription(description); + + SparseVector vector = SparseVector.deepCopy(thatNode.getConceptVector()); + node.setConceptVector(vector); + } + } + + + /** + * Initializes the Root 
node of the tree + */ + @Override + protected void initializeRoot(String root_label) { + ConceptTreeNode root = ConceptTreeNode.makeBasicTypedNode(root_label); + initializeRoot(root); + } + + + /** + * Returns all the ChildNodes of a particular node (labelID) + */ + @Override + public Set> getChildren(String label) { + Set> set = + getChildren(ConceptTreeNode.makeBasicTypedNode(label)); + + if (set == null) + return null; + + if (set.isEmpty()) + return Collections.emptySet(); + + Set> newSet = new HashSet<>(set.size()); + + newSet.addAll(set); + + return newSet; + } + + + /** + * Returns the Parent Node of a particular node (labelID) + */ + @Override + public ConceptTreeNode getParent(String label) { + ConceptTreeNode parent = + getParent(ConceptTreeNode.makeBasicTypedNode(label)); + return parent; + } + + + /** + * Adds an edge between a ParentNode and a ChildNode + */ + @Override + public boolean addEdge(String parent, String child) { + ConceptTreeNode parentNode = ConceptTreeNode.makeBasicTypedNode(parent); + ConceptTreeNode childNode = ConceptTreeNode.makeBasicTypedNode(child); + + return addEdge(parentNode, childNode); + } + + /** + * Returns the depth of a particular node (labelID) + */ + public int getDepth(String label) { + return getDepth(ConceptTreeNode.makeBasicTypedNode(label)); + } + + /** + * Returns all the parent nodes of a particular node (labelID) + */ + public List> getAllParents(String label) { + List> parentNodes = + getAllParents(ConceptTreeNode.makeBasicTypedNode(label)); + + if (parentNodes == null) + return null; + + List> parents = new ArrayList<>(parentNodes.size()); + + parents.addAll(parentNodes); + + return parentNodes; + } + + /** + * This function initializes the representations of the nodes using the LabelTree and the Embedding Objects + * -- Uses the LabelTree as the Tree Structure, and + * -- Uses the labelDescription of each node to get the corresponding vector representation + */ + public void initializeTree() { + for (ConceptTreeNode node : getNodes()) { + String labelID = node.getLabelID(); + String description = labelTree.getLabelDescription(labelID); + + node.setLabelDescription(description); + + SparseVector concepts = + embedding.getVector(node.getLabelDescription(), numConcepts); + concepts.updateNorm(globalConceptWeights); + + node.setConceptVector(concepts); + } + } + + /** + * This Utility function takes multiple ConceptTrees as input, and returns a ConceptTree + * that averages the representations at each node. + */ + public static ConceptTree getAvgConceptTree( + List> conceptTreeList) { + ConceptTree avgTree = new ConceptTree<>(conceptTreeList.get(0)); + + for (ConceptTreeNode node : avgTree.getNodes()) { + if (avgTree.isRoot(node)) + continue; + + String currentLabelID = node.getLabelID(); + List> vectors = new ArrayList<>(); + + for (ConceptTree tree : conceptTreeList) { + vectors.add(tree.getNodeFromLabel(currentLabelID).getConceptVector()); + } + + SparseVector avgVector = SparseVectorOperations.averageMultipleVectors(vectors); + node.setConceptVector(avgVector); + } + + return avgTree; + } + + /** + * This Utility function dumps a text representation of the tree to the disk. 
+ */ + public void dumpTreeAsString(String filePath) { + try(FileWriter writer = new FileWriter(filePath)) { + + List> nodeList = getBreadthOrderedNodeList(); + + for (ConceptTreeNode node : nodeList) { + String parent; + + if (isRoot(node)) + parent = NIL; + else + parent = getParent(node).getLabelID(); + + writer.write(parent + "\t" + node.getLabelID() + "\t" + node.getLabelDescription() + + "\t" + node.getConceptVector().toString() + "\n"); + } + + } catch (IOException e) { + e.printStackTrace(); + logger.error("Error writing to file at " + filePath); + throw new RuntimeException("Error writing to file at " + filePath); + } + } + + @SuppressWarnings("unchecked") + /** + * Reads and returns a serialized ConceptTree from a file + */ + public static ConceptTree loadTree(String labelRepFile) { + try(ObjectInputStream in = new ObjectInputStream(new FileInputStream(labelRepFile))) { + ConceptTree tree = (ConceptTree) in.readObject(); + return tree; + } catch (FileNotFoundException e) { + e.printStackTrace(); + logger.error("File not found at " + labelRepFile); + throw new RuntimeException("File not found at " + labelRepFile); + } catch (IOException e) { + e.printStackTrace(); + logger.error("Error reading from file"); + throw new RuntimeException("Error reading from file"); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + logger.error("Error deserializing the ConceptTree"); + throw new RuntimeException("Error deserializing the ConceptTree"); + } + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(""); + + List> nodes = getBreadthOrderedNodeList(); + + for (ConceptTreeNode node : nodes) { + if (!isLeaf(node)) { + for (ConceptTreeNode child : getChildren(node)) { + sb.append(node.getLabelID()).append("\t"); + sb.append(child).append("\n"); + } + } + } + + return sb.toString(); + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/ConceptTreeNode.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/ConceptTreeNode.java new file mode 100755 index 000000000..71bd9f197 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/ConceptTreeNode.java @@ -0,0 +1,122 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.classifier; + +import java.io.Serializable; + +import edu.illinois.cs.cogcomp.datalessclassification.hierarchy.TreeNode; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVector; + +/** + * + * The Node Class used by {@link ConceptTree} internally. + * Wraps labelDescription and vector representation of the node within it. + * + * ConceptTreeNode is to {@link ConceptTree}, as {@link LabelTreeNode} is to {@link LabelTree} + * + * Each ConceptTreeNode contains the vector representation for a node (labelID) in the LabelTree. 
+ * + * @author yqsong@illinois.edu + * @author shashank + */ + +public class ConceptTreeNode extends TreeNode { + + private static final long serialVersionUID = 1L; + + private String labelDescription; + private SparseVector conceptVector; + + /** + * A convenience factory function to create a basic ConceptTreeNode + */ + public static ConceptTreeNode makeBasicTypedNode(String labelID) { + ConceptTreeNode node = new ConceptTreeNode<>("", labelID, null); + return node; + } + + /** + * A convenience factory function to create a ConceptTreeNode + */ + public static ConceptTreeNode makeNode(String labelDesc, + String labelID, SparseVector conceptVector) { + ConceptTreeNode node = new ConceptTreeNode<>(labelDesc, labelID, conceptVector); + return node; + } + + public ConceptTreeNode(String labelID) { + this("", labelID); + } + + public ConceptTreeNode(String labelDesc, String labelID) { + this(labelDesc, labelID, null); + } + + public ConceptTreeNode(String labelDesc, String labelID, SparseVector conceptVector) { + super(labelID); + setLabelDescription(labelDesc); + setConceptVector(conceptVector); + } + + /** + * Returns the labelDescription of the node + */ + public String getLabelDescription() { + return this.labelDescription; + } + + /** + * Sets the labelDescription of the node + */ + public void setLabelDescription(String labelDesc) { + this.labelDescription = labelDesc; + } + + /** + * Returns the vector representation of the node + */ + public SparseVector getConceptVector() { + return this.conceptVector; + } + + /** + * Sets the vector representation of the node + */ + public void setConceptVector(SparseVector vector) { + if (vector == null) + vector = new SparseVector<>(); + + this.conceptVector = vector; + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + if (!(o instanceof ConceptTreeNode)) + return false; + + ConceptTreeNode other = (ConceptTreeNode) o; + + return this.labelID.equals(other.getLabelID()); + } + + @Override + public int hashCode() { + return labelID.hashCode(); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(""); + + sb.append(labelID).append("\t"); + sb.append(conceptVector); + + return sb.toString(); + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/DatalessClassifierML.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/DatalessClassifierML.java new file mode 100755 index 000000000..4e7aadc14 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/DatalessClassifierML.java @@ -0,0 +1,362 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.classifier; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.config.DatalessConfigurator; +import edu.illinois.cs.cogcomp.datalessclassification.util.HashSort; +import edu.illinois.cs.cogcomp.datalessclassification.util.LabelScorePair; +import edu.illinois.cs.cogcomp.datalessclassification.util.LabelResultTree; +import edu.illinois.cs.cogcomp.datalessclassification.util.LabelResultTreeNode; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVector; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVectorOperations; + +/** + * The class which implements various Inference algorithms for the Multi-Label Hierarchical Dataless Classification. + * - Supports both Bottom-Up and Top-Down inference + * - Provides support for controlling the minimum and maximum number of labels selected at each level + * - Provides support for controlling the number of labels selected at a level based on the cumulative similarity score + * - Provides functions to retrieve just a flat-list of selected labels, or the full depth-level classification information + * + * @author yqsong@illinois.edu + * @author shashank + */ +public class DatalessClassifierML implements IConceptClassificationTree { + private ConceptTree conceptTree; + + private boolean bottomUp; + + private double classifierThreshold; + private int classifierLeastK; + private int classifierMaxK; + + public DatalessClassifierML(ResourceManager config, ConceptTree conceptTree) { + this.conceptTree = conceptTree; + + this.bottomUp = config.getBoolean(DatalessConfigurator.BottomUp_Inference.key); + this.classifierThreshold = config.getDouble(DatalessConfigurator.classifierThreshold.key); + this.classifierLeastK = config.getInt(DatalessConfigurator.classifierLeastK.key); + this.classifierMaxK = config.getInt(DatalessConfigurator.classifierMaxK.key); + } + + @Override + public Map> getFullDepthPredictions(SparseVector docVector) { + return getFullDepthPredictions(docVector, new HashMap<>()); + } + + /** + * Overrides getFullDepthPredictions to provide additional functionality for providing different weights to different dimensions + * of the underlying embedding + */ + public Map> getFullDepthPredictions(SparseVector docVector, + Map conceptWeights) { + if (bottomUp) + return getFullDepthPredictionsBottomUp(docVector, conceptWeights); + else + return getFullDepthPredictionsTopDown(docVector, conceptWeights); + } + + /** + * Selects some leaf nodes (using either TopK or a score based Threshold), and then selects their path to the root as the + * output label set, while reusing the scores of the leaf labels. 
+ * + * Returns a Map, where key is the depth, and value is a list of selected labelIDs at that depth with their absolute similarity scores + */ + private Map> getFullDepthPredictionsBottomUp(SparseVector docVector, Map conceptWeights) { + double classifierMLThreshold = classifierThreshold; + int leastK = classifierLeastK; + int maxK = classifierMaxK; + + Set> leafSet = conceptTree.getLeafSet(); + + Map orgSimilarities = new HashMap<>(); + Map normalizedSimilarities = new HashMap<>(); + + + /** + * We calculate normalized similarities so as to be able to threshold at a particular + * (absolute) value while selecting the labels + */ + double maxSimilarity = 0 - Double.MAX_VALUE; + double minSimilarity = Double.MAX_VALUE; + + for (ConceptTreeNode leafNode : leafSet) { + double similarity = + SparseVectorOperations.cosine(leafNode.getConceptVector(), docVector, + conceptWeights); + orgSimilarities.put(leafNode.getLabelID(), similarity); + + if (similarity > maxSimilarity) { + maxSimilarity = similarity; + } + + if (similarity < minSimilarity) { + minSimilarity = similarity; + } + } + + if (minSimilarity < 0) { + for (String labelID : orgSimilarities.keySet()) { + orgSimilarities.put(labelID, orgSimilarities.get(labelID) - minSimilarity); + maxSimilarity = maxSimilarity - minSimilarity; + minSimilarity = 0; + } + } + + double sumSimilarity = 0; + + for (String leafLabel : orgSimilarities.keySet()) { + double value = + (orgSimilarities.get(leafLabel) - minSimilarity) + / (maxSimilarity - minSimilarity + Double.MIN_VALUE); + + if (orgSimilarities.size() == 1) { + value = 1; + } + + normalizedSimilarities.put(leafLabel, value); + sumSimilarity += value; + } + + for (String leafLabel : normalizedSimilarities.keySet()) { + normalizedSimilarities.put(leafLabel, normalizedSimilarities.get(leafLabel) + / (sumSimilarity + Double.MIN_VALUE)); + } + + Map> depthLabelMap = new HashMap<>(); + + TreeMap sortedSimilarities = HashSort.sortByValues(normalizedSimilarities); + + double ratio = 0; + int labelCount = 0; + + /** + * Basically the portion of the code below selects certain leaf nodes (either by similarity + * threshold or by topK), and selects their path to the root in the tree -- with their + * scores being used as the scores of their leaf nodes. 
+ */ + + for (String leafLabelID : sortedSimilarities.keySet()) { + ratio += normalizedSimilarities.get(leafLabelID); + + if ((ratio < classifierMLThreshold && labelCount < maxK) || labelCount < leastK) { + String labelID = leafLabelID; + double leafSimilarity = orgSimilarities.get(leafLabelID); + + while (labelID != null) { + int depth = conceptTree.getDepth(labelID); + + if (!depthLabelMap.containsKey(depth)) { + depthLabelMap.put(depth, new ArrayList<>()); + } + + LabelScorePair labelPair = new LabelScorePair(labelID, leafSimilarity); + depthLabelMap.get(depth).add(labelPair); + + labelID = conceptTree.getLabelTree().getParent(labelID); + } + } + + labelCount++; + } + + return depthLabelMap; + } + + /** + * Gets the DepthPredictions (using either bottomUp or topDown) and then just returns + * a flat-bag of selected labelIDs (independent of their depth in the tree) + * + * Use this function if you just want a flat-list of selected labelIDs from the tree, where only topK labels + * have been selected at each level + */ + @Override + public Set getFlatPredictions(SparseVector docVector, int topK) { + Map> testDepthLabelMap = getPrunedDepthPredictions(docVector, topK); + + Set predictedLabels = new HashSet<>(); + + for (Set labels : testDepthLabelMap.values()) { + predictedLabels.addAll(labels); + } + + return predictedLabels; + } + + /** + * Gets the FullPredictions (using either bottomUp or topDown) and then selects at most topK labels at each level + * + * Return a Map, where key is the Depth, and the value is the Set of selected labelIDs at the depth + * + * Use this function when you want the depth information associated with the selected labelIDs as well, and want to + * limit the number of labels selected at each depth + */ + @Override + public Map> getPrunedDepthPredictions(SparseVector docVector, int topK) { + Map> testDepthLabelMap = new HashMap<>(); + + Map> labelResultsInDepth = getFullDepthPredictions(docVector); + + for (int depth : labelResultsInDepth.keySet()) { + /** + TODO: This block assumes that Depth = 0 will always be the Root Node of the Tree + TODO: However, instead of the actual root node provided by the end-user, the underlying tree implementation might use a placeholder + TODO: for the root node, and thus this check might lead to some logical errors later. + + TODO: Thus, this is very risky and needs to go once the labelTree and ConceptTree classes have been refactored + */ + if (depth == 0) + continue; + + List classifiedLabelList = labelResultsInDepth.get(depth); + + if (classifiedLabelList == null) { + classifiedLabelList = new ArrayList<>(); + } + + Set classifiedLabelSet = new HashSet<>(); + + for (int i = 0; i < Math.min(topK, classifiedLabelList.size()); i++) { + //This check is currently required since labelIDs are represented as a String, and thus they might clash + if (!classifiedLabelSet.contains(classifiedLabelList.get(i).getLabelID())) + classifiedLabelSet.add(classifiedLabelList.get(i).getLabelID()); + } + + testDepthLabelMap.put(depth, classifiedLabelSet); + } + + return testDepthLabelMap; + } + + /** + * Selects at most K children per node, while traversing Top-Down in the tree. 
+ */ + private Map> getFullDepthPredictionsTopDown(SparseVector documentConceptVector, Map conceptWeights) { + LabelResultTree labelResult = new LabelResultTree(); + LabelScorePair labelPair = new LabelScorePair(conceptTree.getRootLabel(), 1); + + LabelResultTreeNode resultTreeRootNode = labelResult.getRootNode(); + + resultTreeRootNode.setLabelScorePair(labelPair); + resultTreeRootNode.setDepth(0); + + retrieveLabelTopDown(documentConceptVector, conceptTree.getRoot(), resultTreeRootNode, conceptWeights); + + Map> labelResultsInDepth = labelResult.getFullDepthPredictions(); + return labelResultsInDepth; + } + + /** + * Recursive function + * + * Overall, given the Root of a ConceptTree, this function selects at most K children for each Node, + * creates a corresponding LabelResultTree, and returns the root of that Tree. + */ + private void retrieveLabelTopDown(SparseVector docConceptVector, + ConceptTreeNode conceptTreeRootNode, LabelResultTreeNode resultTreeRootNode, + Map conceptWeights) { + + int maxK = classifierMaxK; + + Map orgSimilarities = new HashMap<>(); + Map similarities = new HashMap<>(); + Map> labelIdNodeMap = new HashMap<>(); + + double maxSimilarity = 0 - Double.MAX_VALUE; + double minSimilarity = Double.MAX_VALUE; + + for (ConceptTreeNode childNode : conceptTree.getChildren(conceptTreeRootNode)) { + double similarity = + SparseVectorOperations.cosine(docConceptVector, childNode.getConceptVector(), + conceptWeights); + + orgSimilarities.put(childNode.getLabelID(), similarity); + labelIdNodeMap.put(childNode.getLabelID(), childNode); + + if (similarity > maxSimilarity) { + maxSimilarity = similarity; + } + + if (similarity < minSimilarity) { + minSimilarity = similarity; + } + } + + if (minSimilarity < 0) { + for (String labelID : orgSimilarities.keySet()) { + orgSimilarities.put(labelID, orgSimilarities.get(labelID) - minSimilarity); + maxSimilarity = maxSimilarity - minSimilarity; + minSimilarity = 0; + } + } + + double sumSimilarity = 0; + + for (String labelID : orgSimilarities.keySet()) { + double value = + (orgSimilarities.get(labelID) - minSimilarity) + / (maxSimilarity - minSimilarity + Double.MIN_VALUE); + + if (orgSimilarities.size() == 1) { + value = 1; + } + + similarities.put(labelID, value); + sumSimilarity += value; + } + + for (String labelID : similarities.keySet()) { + similarities.put(labelID, similarities.get(labelID) / (sumSimilarity + Double.MIN_VALUE)); + } + + if (conceptTree.getChildCount(conceptTreeRootNode) == 0) { + resultTreeRootNode.setIsLeaf(true); + } else { + resultTreeRootNode.setIsLeaf(false); + } + + TreeMap sortedSimilarities = HashSort.sortByValues(similarities); + + int labelCount = 0; + + List labelResultChildNodes = resultTreeRootNode.getChildren(); + + if (sumSimilarity > 0) { + for (String labelID : sortedSimilarities.keySet()) { + if (labelCount < maxK && similarities.get(labelID) > 0) { + LabelScorePair labelPair = + new LabelScorePair(labelID, orgSimilarities.get(labelID)); + + LabelResultTreeNode labelResultChildNode = new LabelResultTreeNode(); + labelResultChildNode.setLabelScorePair(labelPair); + labelResultChildNode.setDepth(resultTreeRootNode.getDepth() + 1); + + labelResultChildNodes.add(labelResultChildNode); + + retrieveLabelTopDown(docConceptVector, labelIdNodeMap.get(labelID), labelResultChildNode, + conceptWeights); + } + + labelCount++; + + if (labelCount >= maxK) { + break; + } + } + } + } +} diff --git 
a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/IConceptClassificationTree.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/IConceptClassificationTree.java new file mode 100755 index 000000000..259ebafd1 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/IConceptClassificationTree.java @@ -0,0 +1,50 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.classifier; + +import java.io.Serializable; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import edu.illinois.cs.cogcomp.datalessclassification.util.LabelScorePair; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVector; + +/** + * An Inference Interface, to be implemented by all variants of Dataless Classifier + * + * @author yqsong@illinois.edu + * @author shashank + */ + +public interface IConceptClassificationTree { + + /** + * Returns a Map, where key is the depth, and value is a list of selected labelIDs at that depth with their absolute similarity scores + * + * If a particular implementation wants the end-user to be able to select a particular inference algorithm, this function + * should internally redirect to the relevant functions + */ + Map> getFullDepthPredictions(SparseVector vector); + + + /** + * Return a Map, where key is the Depth, and the value is the Set of selected topK labelIDs at that depth + * + * Should ideally call getFullDepthPredictions internally and select the topK labels at each depth. + * + * Use this function when you want the depth information associated with the selected labelIDs as well, and want to + * limit the number of labels selected at each depth + */ + Map> getPrunedDepthPredictions(SparseVector docVector, int topK); + + /** + * Returns just a flat-bag of selected labelIDs (independent of their depth in the tree) + */ + Set getFlatPredictions(SparseVector docVector, int topK); +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/LabelTree.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/LabelTree.java new file mode 100755 index 000000000..431c28c57 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/LabelTree.java @@ -0,0 +1,553 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.classifier; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import edu.illinois.cs.cogcomp.datalessclassification.hierarchy.SimpleTree; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class represents the user-desired label ontology/hierarchy, in which each node has a + * - labelID + * - labelName + * - labelDescription + * + * @author shashank + */ + +public class LabelTree extends SimpleTree { + + private static Logger logger = LoggerFactory.getLogger(LabelTree.class); + private static final long serialVersionUID = 1L; + + private String root_label; + + public LabelTree() { + this("root"); + } + + public LabelTree(String root) { + super(); + this.root_label = root; + initializeRoot(root_label); + } + + /** + * Copy Constructor + */ + public LabelTree(LabelTree thatTree) { + this(thatTree.root_label); + + Set topNodes = thatTree.getChildren(thatTree.root_label); + + Map> childMap = new HashMap<>(); + Map labelNameMap = new HashMap<>(); + Map labelDescriptionMap = new HashMap<>(); + + for (LabelTreeNode labelNode : thatTree.getNodes()) { + String labelID = labelNode.getLabelID(); + + if (!thatTree.isLeaf(labelNode)) { + childMap.put(labelID, thatTree.getChildren(labelID)); + } + + labelNameMap.put(labelID, labelNode.getLabelName()); + labelDescriptionMap.put(labelID, labelNode.getLabelDescription()); + } + + initializeTree(topNodes, childMap, labelNameMap, labelDescriptionMap); + } + + /** + * Initializes the root node of the tree + */ + private void initializeRoot(String root_label) { + LabelTreeNode root = LabelTreeNode.makeBasicNode(root_label); + initializeRoot(root); + } + + /** + * Initializes the Tree completely, given the: + * @param topNodes: Set containing the labelIDs of the top-level nodes in the tree + * @param childMap: Map containing the parentID - childIds mapping + * @param labelNameMap: Map containing the labelID - labelName mapping + * @param labelDescriptionMap: Map containing the labelID - labelDescription mapping + */ + public void initializeTree(Set topNodes, Map> childMap, + Map labelNameMap, Map labelDescriptionMap) { + + initializeTreeStructure(topNodes, childMap); + initializeLabelNames(labelNameMap); + initializeLabelDescriptions(labelDescriptionMap); + } + + /** + * Initializes the structure of the Tree, given the: + * @param topNodes: Set containing the labelIDs of the top-level nodes in the tree + * @param childMap: Map containing the parentID - childIds mapping + */ + public void initializeTreeStructure(Set topNodes, Map> childMap) { + List exploreNodes = new ArrayList<>(); + + for (String topNode : topNodes) + addEdge(root_label, topNode); + + exploreNodes.addAll(topNodes); + + while (exploreNodes.size() != 0) { + String node = exploreNodes.get(0); + + if (childMap.containsKey(node)) { + Set children = childMap.get(node); + addEdges(node, children); + exploreNodes.addAll(children); + } + + exploreNodes.remove(0); + } + } + + /** + * Clears the labelNames for all the nodes + */ + public void clearLabelNames() { + for (LabelTreeNode node : getNodes()) + node.setLabelName(""); 
+ } + + /** + * Sets the labelNames from the labelID -> labelName map + */ + public void initializeLabelNames(Map labelNameMap) { + for (String labelId : labelNameMap.keySet()) + setLabelName(labelId, labelNameMap.get(labelId)); + } + + /** + * Clears the labelDescriptions for all the nodes + */ + public void clearLabelDescriptions() { + for (LabelTreeNode node : getNodes()) + node.setLabelDescription(""); + } + + /** + * Sets the labelDescriptions from the labelID -> labelDescription map + */ + public void initializeLabelDescriptions(Map labelDescriptionMap) { + for (String labelId : labelDescriptionMap.keySet()) + setLabelDescription(labelId, labelDescriptionMap.get(labelId)); + } + + /** + * Adds a new node to the Tree, given the: + * @param parent: the labelId of the parent (the parent node should exist before adding the children) + * @param labelId: the labelId of the node + * @param labelName: the labelName of the node + * @param labelDesc: the labelDescription of the node + */ + protected boolean addNode(String parent, String labelId, String labelName, String labelDesc) { + LabelTreeNode node = new LabelTreeNode(labelId, labelName, labelDesc); + LabelTreeNode parentNode = LabelTreeNode.makeBasicNode(parent); + + return addEdge(parentNode, node); + } + + /** + * Returns the labelIDs of the children of a particular node (labelID) + */ + public Set getChildren(String labelId) { + Set set = getChildren(LabelTreeNode.makeBasicNode(labelId)); + + if (set == null) + return null; + + if (set.isEmpty()) + return Collections.emptySet(); + + Set labelSet = new HashSet<>(set.size()); + + for (LabelTreeNode node : set) { + labelSet.add(node.getLabelID()); + } + + return labelSet; + } + + /** + * Returns the labelIDs of the leaf nodes in the tree + */ + public Set getLeafLabels() { + Set set = getLeafSet(); + + if (set == null) + return null; + + if (set.isEmpty()) + return Collections.emptySet(); + + Set newSet = new HashSet<>(set.size()); + + for (LabelTreeNode node : set) { + newSet.add(node.getLabelID()); + } + + return newSet; + } + + /** + * Returns the labelID of a parent of a particular node (labelID) + */ + public String getParent(String labelId) { + LabelTreeNode parent = getParent(LabelTreeNode.makeBasicNode(labelId)); + + if (parent == null) + return null; + + return parent.getLabelID(); + } + + /** + * Returns the labelName of a particular node (labelID) in the tree + */ + public String getLabelName(String labelId) { + LabelTreeNode node = getNode(LabelTreeNode.makeBasicNode(labelId)); + + if (node == null) + return null; + + return node.getLabelName(); + } + + /** + * Sets the labelName of a particular node (labelID) in the tree + */ + public boolean setLabelName(String labelId, String labelName) { + LabelTreeNode node = getNode(LabelTreeNode.makeBasicNode(labelId)); + + if (node == null) + return false; + + node.setLabelName(labelName); + return true; + } + + /** + * Returns the labelDescription of a particular node (labelID) in the tree + */ + public String getLabelDescription(String labelId) { + LabelTreeNode node = getNode(LabelTreeNode.makeBasicNode(labelId)); + + if (node == null) + return null; + + return node.getLabelDescription(); + } + + /** + * Sets the labelDescription of a particular node (labelID) in the tree + */ + public boolean setLabelDescription(String labelId, String labelDesc) { + LabelTreeNode node = getNode(LabelTreeNode.makeBasicNode(labelId)); + + if (node == null) + return false; + + node.setLabelDescription(labelDesc); + return true; + } + + /** + * Adds 
an edge between a parentNode and a childNode + */ + protected boolean addEdge(String parent, String child) { + LabelTreeNode parentNode = LabelTreeNode.makeBasicNode(parent); + LabelTreeNode childNode = LabelTreeNode.makeBasicNode(child); + + return addEdge(parentNode, childNode); + } + + /** + * Adds edges between a parent node and child nodes + */ + protected boolean addEdges(String parent, Set children) { + boolean success = true; + + for (String child : children) { + success = addEdge(parent, child); + + if (!success) + break; + } + + return success; + } + + /** + * Returns the Depth of a particular node (labelID) in the tree + */ + public int getDepth(String labelId) { + LabelTreeNode node = LabelTreeNode.makeBasicNode(labelId); + return getDepth(node); + } + + /** + * Returns all the ancestors of a particular node (labelID) in the tree + */ + public List getAllParents(String labelId) { + List parentNodes = getAllParents(LabelTreeNode.makeBasicNode(labelId)); + + if (parentNodes == null) + return null; + + List parents = new ArrayList<>(parentNodes.size()); + + for (LabelTreeNode p : parentNodes) { + parents.add(p.getLabelID()); + } + + return parents; + } + + /** + * Traverses the Tree in a Breadth-First order and returns the labelIDs + */ + public List getBreadthOrderedLabelList() { + List nodes = getBreadthOrderedNodeList(); + + if (nodes == null) + return null; + + List labelIds = new ArrayList<>(nodes.size()); + + for (LabelTreeNode p : nodes) { + labelIds.add(p.getLabelID()); + } + + return labelIds; + } + + /** + * Traverses the Tree in a (Pre-Order) Depth-First order and returns the labelIDs + */ + public List getDepthOrderedLabelList() { + List nodes = getDepthOrderedNodeList(); + + if (nodes == null) + return null; + + List labelIds = new ArrayList<>(nodes.size()); + + for (LabelTreeNode p : nodes) { + labelIds.add(p.getLabelID()); + } + + return labelIds; + } + + /** + * Returns whether the provided labelID corresponds to a leaf in the Tree or not + */ + public boolean isLeaf(String labelId) { + LabelTreeNode node = LabelTreeNode.makeBasicNode(labelId); + return isLeaf(node); + } + + /** + * A utility function that appends the label descriptions of the child nodes + * to their parents' description + * + * Since nodes in a topic/label hierarchy usually follow IS-A property, this function can enrich + * the descriptions of the parent nodes + */ + public void aggregateChildrenDescription() { + List nodeList = getBreadthOrderedNodeList(); + + Collections.reverse(nodeList); + + for (LabelTreeNode node : nodeList) { + String childDesc = getLabelDescription(node.getLabelID()); + + if (!isRoot(node)) { + LabelTreeNode parent = getParent(node); + String parentDesc = getLabelDescription(parent.getLabelID()); + + String newLabelDesc = parentDesc.trim() + " " + childDesc.trim(); + + setLabelDescription(parent.getLabelID(), newLabelDesc); + } + } + } + + /** + * A Utility function that just appends the labelName to the labelDescription. + */ + public void appendLabelNameToDesc() { + for (LabelTreeNode node : getNodes()) { + String labelId = node.getLabelID(); + + String description = getLabelDescription(labelId) + " " + + getLabelName(labelId); + + node.setLabelDescription(description); + } + } + + /** + * A Utility function that just copies the labelNames to labelDescriptions. 
+ * + * In scenarios, where users don't provide descriptions for their labels, this function can + * be used a last resort for Dataless Classification + */ + public void copyLabelNameToDesc() { + for (LabelTreeNode node : getNodes()) { + String labelId = node.getLabelID(); + String labelName = getLabelName(labelId); + node.setLabelDescription(labelName); + } + } + + /** + * Returns the labelIDs of all nodes at the same level as the provided node (labelID) + */ + public Set getSameLevelLabels(String labelId) { + Set nodes = getSameLevelNodes(labelId); + + if (nodes == null) + return null; + + Set output = new HashSet<>(nodes.size()); + + for (LabelTreeNode p : nodes) { + output.add(p.getLabelID()); + } + + return output; + } + + /** + * Returns all the nodes at the same level as the provided node (labelID) + */ + public Set getSameLevelNodes(String labelId) { + int depth = getDepth(labelId); + + if (depth == -1) + return null; + + List nodes = getBreadthOrderedNodeList(); + + Set output = new HashSet<>(); + + for (LabelTreeNode node : nodes) { + int thisDepth = getDepth(node); + + if (thisDepth > depth) + break; + else if (thisDepth == depth) + output.add(node); + } + + return output; + } + + /** + * A utility function which can be used to identify the top-level nodes in the tree, + * if such an information is not explicitly provided by the end-user. + * + * This function uses the parent-children map to identify the top-level nodes. + */ + public static Set identifyTopNodes(Map> childMap) { + Set candidateTopNodes = new HashSet<>(); + Set topNodes = new HashSet<>(); + + Set children = new HashSet<>(); + + for (String parent : childMap.keySet()) { + if (!children.contains(parent)) + candidateTopNodes.add(parent); + + children.addAll(childMap.get(parent)); + } + + for (String candidate : candidateTopNodes) { + if (!children.contains(candidate)) + topNodes.add(candidate); + } + + return topNodes; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof LabelTree)) + return false; + + LabelTree that = (LabelTree) o; + + if (!root_label.equals(that.root_label)) + return false; + + if (!super.equals(that)) + return false; + + return true; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(""); + + List nodes = getBreadthOrderedNodeList(); + + for (LabelTreeNode node : nodes) { + if (!isLeaf(node)) { + for (LabelTreeNode child : getChildren(node)) { + sb.append(node.getLabelID()).append("\t").append(child.getLabelID()).append("\n"); + } + } + } + + return sb.toString(); + } + + /** + * This Utility function dumps a text representation of the tree to the disk. 
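+ *
+ * The output contains one "labelID&lt;TAB&gt;labelDescription" line per non-root node, in breadth-first order.
+ * A minimal usage sketch (the output path below is only illustrative):
+ * <pre>{@code
+ * LabelTree tree = new LabelTree();
+ * // ... initializeTree(...) with your hierarchy, label names and descriptions ...
+ * tree.dumpTreeLabelDesc("/tmp/labelDesc_dump.txt");
+ * }</pre>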
+ */ + public void dumpTreeLabelDesc(String outPath) { + try(BufferedWriter bw = new BufferedWriter(new FileWriter(new File(outPath)))) { + StringBuilder sb = new StringBuilder(""); + + List nodes = getBreadthOrderedNodeList(); + + for (LabelTreeNode node : nodes) { + if (isRoot(node)) + continue; + + sb.append(node.getLabelID()).append("\t").append(node.getLabelDescription()).append("\n"); + } + + bw.write(sb.toString()); + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while writing file"); + throw new RuntimeException("IO Error while writing file"); + } + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/LabelTreeNode.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/LabelTreeNode.java new file mode 100755 index 000000000..f8d6846ef --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/classifier/LabelTreeNode.java @@ -0,0 +1,102 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.classifier; + +import edu.illinois.cs.cogcomp.datalessclassification.hierarchy.TreeNode; + +/** + * The Node Class used by {@link LabelTree} internally. + * Wraps labelID, labelName and labelDescription within it. + * + * @author shashank + */ + +public class LabelTreeNode extends TreeNode { + + private static final long serialVersionUID = 1L; + private String labelName; + private String labelDescription; + + /** + * A convenience factory function to create a basic LabelTreeNode + */ + public static LabelTreeNode makeBasicNode(String labelID) { + LabelTreeNode node = new LabelTreeNode(labelID, "", ""); + return node; + } + + /** + * Copy Constructor + */ + public LabelTreeNode(LabelTreeNode thatNode) { + this(thatNode.getLabelID(), thatNode.getLabelName(), thatNode.getLabelDescription()); + } + + /** + * Initializes the Node with the provided labelID, labelName and labelDescription + */ + LabelTreeNode(String labelID, String labelName, String labelDesc) { + super(labelID); + setLabelName(labelName); + setLabelDescription(labelDesc); + } + + /** + * Gets the LabelDescription for the node + */ + String getLabelDescription() { + return this.labelDescription; + } + + /** + * Sets the LabelDescription for the node + */ + void setLabelDescription(String description) { + this.labelDescription = description; + } + + /** + * Gets the LabelName for the node + */ + String getLabelName() { + return this.labelName; + } + + /** + * Sets the LabelName for the node + */ + void setLabelName(String labelName) { + this.labelName = labelName; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof LabelTreeNode)) + return false; + + LabelTreeNode other = (LabelTreeNode) o; + + return this.labelID.equals(other.getLabelID()); + } + + @Override + public int hashCode() { + return labelID.hashCode(); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(""); + + sb.append(labelID).append("\t"); + sb.append(labelName).append("\t"); + sb.append(labelDescription); + + return sb.toString(); + } +} diff --git 
a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/config/DatalessConfigurator.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/config/DatalessConfigurator.java new file mode 100644 index 000000000..50a882508 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/config/DatalessConfigurator.java @@ -0,0 +1,34 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.config; + +import edu.illinois.cs.cogcomp.core.utilities.configuration.Configurator; +import edu.illinois.cs.cogcomp.core.utilities.configuration.Property; + +/** + * The basic Configurator used by various variants of the Dataless Annotator + * + * @author shashank + */ +public abstract class DatalessConfigurator extends Configurator { + public static final Property BottomUp_Inference = new Property("inferenceBottomUp", "True"); + + public static final Property JSON_Hierarchy_Path = new Property("jsonHierarchyPath", ""); + + public static final Property LabelHierarchy_Path = new Property("labelHierarchyPath", + "hierarchies/20newsgroups/parentChildIdMap.txt"); + public static final Property LabelName_Path = new Property("labelNamePath", + "hierarchies/20newsgroups/idToLabelNameMap.txt"); + public static final Property LabelDesc_Path = new Property("labelDescPath", + "hierarchies/20newsgroups/labelDesc_Kws_embellished.txt"); + + public static final Property topK = new Property("topK", "1"); + public static final Property classifierThreshold = new Property("classifierThreshold", "0.99"); + public static final Property classifierLeastK = new Property("classifierLeastK", "1"); + public static final Property classifierMaxK = new Property("classifierMaxK", "3"); +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/config/ESADatalessConfigurator.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/config/ESADatalessConfigurator.java new file mode 100644 index 000000000..9b45ecee3 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/config/ESADatalessConfigurator.java @@ -0,0 +1,37 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.config; + +import edu.illinois.cs.cogcomp.core.utilities.configuration.Property; +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.ta.ESADatalessAnnotator; + +/** + * The Configurator used by {@link ESADatalessAnnotator} + * + * @author shashank + */ +public class ESADatalessConfigurator extends DatalessConfigurator { + + public static final Property ESA_DIM = new Property("esaDimension", "100"); + + /** + * get a ResourceManager object with the default key/value pairs for this configurator + * + * @return a non-null ResourceManager with appropriate values set. + */ + @Override + public ResourceManager getDefaultConfig() { + Property[] props = + {ESA_DIM, BottomUp_Inference, JSON_Hierarchy_Path, LabelHierarchy_Path, + LabelName_Path, LabelDesc_Path, topK, classifierThreshold, + classifierLeastK, classifierMaxK}; + + return new ResourceManager(generateProperties(props)); + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/config/W2VDatalessConfigurator.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/config/W2VDatalessConfigurator.java new file mode 100644 index 000000000..6995183e9 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/config/W2VDatalessConfigurator.java @@ -0,0 +1,37 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.config; + +import edu.illinois.cs.cogcomp.core.utilities.configuration.Property; +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator; + +/** + * The Configurator used by {@link W2VDatalessAnnotator} + * + * @author shashank + */ +public class W2VDatalessConfigurator extends DatalessConfigurator { + + public static final Property W2V_DIM = new Property("w2vDimension", "100"); + + /** + * get a ResourceManager object with the default key/value pairs for this configurator + * + * @return a non-null ResourceManager with appropriate values set. + */ + @Override + public ResourceManager getDefaultConfig() { + Property[] props = + {W2V_DIM, BottomUp_Inference, JSON_Hierarchy_Path, LabelHierarchy_Path, + LabelName_Path, LabelDesc_Path, topK, classifierThreshold, + classifierLeastK, classifierMaxK}; + + return new ResourceManager(generateProperties(props)); + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/hierarchy/SimpleTree.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/hierarchy/SimpleTree.java new file mode 100644 index 000000000..cbfbe12a0 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/hierarchy/SimpleTree.java @@ -0,0 +1,322 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. 
See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.hierarchy; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; + +/** + * A Basic Tree implementation that satisfies project specific needs by wrapping around + * {@link UnorderedTree} + * + * @author shashank + */ + +public class SimpleTree implements Serializable { + + private static final long serialVersionUID = 1L; + + protected UnorderedTree _tree; + + protected int addedEdgeCnt = 0; + + public SimpleTree() { + _tree = new UnorderedTree<>(); + } + + /** + * @param node the node whose number of children is to be returned + * @return the number of children that the {@code node} has + */ + public int getChildCount(N node) { + return _tree.getChildCount(node); + } + + /** + * Returns a set of node's children. If the node has no children then an empty set will be + * returned. + */ + public Set getChildren(N node) { + return _tree.getChildren(node); + } + + /** + * @return the depth of the node in this tree, or -1 if the node is not present in this tree + */ + public int getDepth(N node) { + return _tree.getDepth(node); + } + + /** + * Returns the height of the tree, or -1 if the tree is empty. + */ + public int getHeight() { + return _tree.getHeight(); + } + + public N getParent(N node) { + return _tree.getParent(node); + } + + public N getRoot() { + return _tree.getRoot(); + } + + public N getNode(N node) { + if (!containsNode(node)) + return null; + + if (isRoot(node)) + return _tree.getRoot(); + + int edge = _tree.getParentEdge(node); + + return _tree.getEndpoints(edge).getSecond(); + } + + /** + * Adds the specified node ({@code child}) as a child of the parent node ({@code parent}). + * + * @param parent node (must exist prior to addition) + * @param child node + * @return {@code true} if the graph has been modified + */ + public boolean addEdge(N parent, N child) { + boolean success = _tree.addEdge(addedEdgeCnt + 1, parent, child); + + if (success) + addedEdgeCnt++; + + return success; + } + + /** + * Initializes the tree with the given node. Can only be invoked once i.e. once the root is set, + * invoking this will throw an exception. + * + * @param node to be used as the root + * @return + */ + public boolean initializeRoot(N node) { + return _tree.addVertex(node); + } + + /** + * Returns true if node is a leaf of this tree, i.e., if it has no + * children. + * + * @param node the node to be queried + */ + public boolean isLeaf(N node) { + return _tree.isLeaf(node); + } + + /** + * Returns true iff v1 is the parent of v2. Note that if + * v2 is the root and v1 is null, this method still + * returns true. + */ + public boolean isParent(N v1, N v2) { + return _tree.isPredecessor(v1, v2); + } + + /** + * Returns true if the given node is the root of this tree + * + * @param node the node to be queried + */ + public boolean isRoot(N node) { + return _tree.isRoot(node); + } + + /** + * Returns true iff v1 is the child of v2. Note that if + * v2 is a leaf node and v1 is null, this method returns + * true. 
+ */ + public boolean isChild(N v1, N v2) { + return _tree.isSuccessor(v1, v2); + } + + public boolean containsNode(N node) { + return _tree.containsVertex(node); + } + + public boolean containsEdge(N v1, N v2) { + Integer edgeIndex = _tree.findEdge(v1, v2); + + if (edgeIndex == null) + return false; + else + return true; + } + + public int getNodeCount() { + return _tree.getVertexCount(); + } + + public Set getNodes() { + return _tree.getVertices(); + } + + public List getBreadthOrderedNodeList() { + List output = new ArrayList<>(_tree.getVertexCount()); + List exploreNodes = new ArrayList<>(); + + exploreNodes.add(_tree.getRoot()); + + while (exploreNodes.size() != 0) { + N node = exploreNodes.get(0); + exploreNodes.remove(0); + output.add(node); + + if (!isLeaf(node)) { + Set children = getChildren(node); + exploreNodes.addAll(children); + } + } + + return output; + } + + public List getDepthOrderedNodeList() { + List output = new ArrayList<>(_tree.getVertexCount()); + List exploreNodes = new ArrayList<>(); + + exploreNodes.add(_tree.getRoot()); + + while (exploreNodes.size() != 0) { + N node = exploreNodes.get(0); + exploreNodes.remove(0); + output.add(node); + + if (!isLeaf(node)) { + Set children = getChildren(node); + + for (N child : children) { + exploreNodes.add(0, child); + } + } + } + + return output; + } + + public Set getSiblingsInclusive(N node) { + if (!_tree.containsVertex(node)) + return null; + + if (isRoot(node)) + return Collections.singleton(node); + + Set siblings = new HashSet<>(); + siblings.addAll(getSiblingsExclusive(node)); + siblings.add(node); + + return new ImmutableSet.Builder().addAll(siblings).build(); + } + + public Set getSiblingsExclusive(N node) { + if (!_tree.containsVertex(node)) + return null; + + if (isRoot(node)) + return null; + + N parent = getParent(node); + + Set siblings = new HashSet<>(); + + siblings.addAll(getChildren(parent)); + + siblings.remove(node); + + return new ImmutableSet.Builder().addAll(siblings).build(); + } + + public List getAllParents(N node) { + if (!_tree.containsVertex(node)) + return null; + + if (isRoot(node)) + return null; + + List parents = new ArrayList<>(); + + N child = node; + N parent; + + while ((parent = getParent(child)) != null) { + parents.add(parent); + child = parent; + } + + return new ImmutableList.Builder().addAll(parents).build(); + } + + public Set getLeafSet() { + Set leafSet = new HashSet<>(); + + for (N node : getNodes()) { + if (isLeaf(node)) + leafSet.add(node); + } + + return new ImmutableSet.Builder().addAll(leafSet).build(); + } + + public Set getNodesAtSameLevel(N node) { + if (!containsNode(node)) + return null; + + int depth = getDepth(node); + + return getAllNodesAtDepth(depth); + } + + private Set getAllNodesAtDepth(int depth) { + if (getHeight() < depth) + return null; + + if (getHeight() == depth) + return getLeafSet(); + + Set nodeSet = new HashSet<>(); + + for (N node : getNodes()) { + if (getDepth(node) == depth) + nodeSet.add(node); + } + + return new ImmutableSet.Builder().addAll(nodeSet).build(); + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + if (!(o instanceof SimpleTree)) + return false; + + SimpleTree that = (SimpleTree) o; + + if (this.addedEdgeCnt != that.addedEdgeCnt) + return false; + + if (!this._tree.equals(that._tree)) + return false; + + return true; + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/hierarchy/TreeNode.java 
b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/hierarchy/TreeNode.java new file mode 100755 index 000000000..b5ad52ff5 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/hierarchy/TreeNode.java @@ -0,0 +1,79 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.hierarchy; + +import java.io.Serializable; + +/** + * The most basic Node class; contains just the ID of the label + * + * @author yqsong@illinois.edu + * @author shashank + */ + +public class TreeNode implements Serializable { + + private static final long serialVersionUID = 1L; + + protected String labelID; + + /** + * A convenience factory function to create a basic TreeNode + */ + public static TreeNode makeBasicNode(String labelID) { + TreeNode node = new TreeNode(labelID); + return node; + } + + /** + * Copy Constructor + */ + public TreeNode(TreeNode thatNode) { + this(thatNode.getLabelID()); + } + + /** + * Initializes the TreeNode with the provided labelID + */ + public TreeNode(String labelID) { + this.labelID = labelID; + } + + /** + * Gets the LabelID for the node + */ + public String getLabelID() { + return this.labelID; + } + + /** + * Sets the LabelID for the node + */ + public void setLabelID(String labelID) { + this.labelID = labelID; + } + + @Override + public String toString() { + return labelID; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof TreeNode)) + return false; + + TreeNode other = (TreeNode) o; + return this.labelID.equals(other.labelID); + } + + @Override + public int hashCode() { + return labelID.hashCode(); + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/hierarchy/UnorderedTree.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/hierarchy/UnorderedTree.java new file mode 100644 index 000000000..2816bf32a --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/hierarchy/UnorderedTree.java @@ -0,0 +1,696 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.hierarchy; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.HashSet; + +import com.google.common.collect.ImmutableSet; + +import edu.uci.ics.jung.graph.AbstractTypedGraph; +import edu.uci.ics.jung.graph.Tree; +import edu.uci.ics.jung.graph.util.EdgeType; +import edu.uci.ics.jung.graph.util.Pair; + +/** + * A Generic Tree Implementation, largely based on Jung's OrderedKAryTree (edu.uci.ics.jung.graph) + * -- adapted largely to discount the order of nodes, and to support arbitrary number of children -- + * allows some data to be associated with Edges, which was not immediately required but is kind of a + * forward planning for the project + * + * @author shashank + */ + +public class UnorderedTree extends + AbstractTypedGraph implements Tree { + + private static final long serialVersionUID = 1L; + + protected Map> edge_vpairs; + protected Map vertex_data; + protected int height; + protected V root; + + public UnorderedTree() { + super(EdgeType.DIRECTED); + this.height = -1; + this.edge_vpairs = new HashMap<>(); + this.vertex_data = new HashMap<>(); + } + + /** + * @param vertex the vertex whose number of children is to be returned + * @return the number of children that the {@code vertex} has + */ + @Override + public int getChildCount(V vertex) { + if (!containsVertex(vertex)) + return 0; + + Set edges = vertex_data.get(vertex).child_edges; + + if (edges == null) + return 0; + + return edges.size(); + } + + @Override + public Set getChildEdges(V vertex) { + if (!containsVertex(vertex)) + return null; + + Set edges = vertex_data.get(vertex).child_edges; + + if (edges == null) + return Collections.emptySet(); + else + return new ImmutableSet.Builder().addAll(edges).build(); + } + + /** + * Returns a set of vertex's child vertices. If the vertex has no children then an empty set + * will be returned. + */ + @Override + public Set getChildren(V vertex) { + if (!containsVertex(vertex)) + return null; + + Set edges = vertex_data.get(vertex).child_edges; + + if (edges == null) + return Collections.emptySet(); + + Set children = new HashSet(edges.size()); + + for (E edge : edges) + children.add(this.getOpposite(vertex, edge)); + + return new ImmutableSet.Builder().addAll(children).build(); + } + + /** + * @return the depth of the vertex in this tree, or -1 if the vertex is not present in this tree + */ + @Override + public int getDepth(V vertex) { + if (!containsVertex(vertex)) + return -1; + + return vertex_data.get(vertex).depth; + } + + /** + * Returns the height of the tree, or -1 if the tree is empty. 
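+ *
+ * Note: the height is maintained incrementally; addVertex() sets it to 0 when the root is added, and
+ * addEdge() raises it whenever a newly added child sits deeper than the current maximum depth.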
+ */ + @Override + public int getHeight() { + return height; + } + + @Override + public V getParent(V vertex) { + if (!containsVertex(vertex)) + return null; + else if (vertex.equals(root)) + return null; + + return edge_vpairs.get(vertex_data.get(vertex).parent_edge).getFirst(); + } + + @Override + public E getParentEdge(V vertex) { + if (!containsVertex(vertex)) + return null; + + return vertex_data.get(vertex).parent_edge; + } + + @Override + public V getRoot() { + return root; + } + + @Override + public Collection> getTrees() { + Collection> forest = new ArrayList<>(1); + forest.add(this); + + return forest; + } + + /** + * Adds the specified {@code child} vertex and edge {@code e} to the graph with the specified + * parent vertex {@code parent}. + * + * @param e the edge to add + * @param parent the source of the edge to be added + * @param child the destination of the edge to be added + * @return {@code true} if the graph has been modified + */ + @Override + public boolean addEdge(E e, V parent, V child) { + if (e == null || child == null || parent == null) + throw new IllegalArgumentException("Inputs must not be null"); + + if (!containsVertex(parent)) + throw new IllegalArgumentException("Tree must already include parent: " + parent); + + if (containsVertex(child)) + throw new IllegalArgumentException("Tree must not already include child: " + child); + + if (parent.equals(child)) + throw new IllegalArgumentException("Input vertices must be distinct"); + + Pair endpoints = new Pair<>(parent, child); + + if (containsEdge(e)) { + if (!endpoints.equals(edge_vpairs.get(e))) + throw new IllegalArgumentException("Tree already includes edge" + e + + " with different endpoints " + edge_vpairs.get(e)); + else + return false; + } + + VertexData parent_data = vertex_data.get(parent); + Set outedges = parent_data.child_edges; + + if (outedges == null) { + parent_data.child_edges = new HashSet<>(); + outedges = parent_data.child_edges; + } + + outedges.add(e); + + // initialize VertexData for child; leave child's child_edges null for now + VertexData child_data = new VertexData(e, parent_data.depth + 1); + vertex_data.put(child, child_data); + + height = child_data.depth > height ? 
child_data.depth : height; + edge_vpairs.put(e, endpoints); + + return true; + } + + @Override + public boolean addEdge(E e, V v1, V v2, EdgeType edge_type) { + this.validateEdgeType(edge_type); + + return addEdge(e, v1, v2); + } + + @Override + public boolean addEdge(E edge, Pair endpoints, EdgeType edgeType) { + if (edge == null || endpoints == null) + throw new IllegalArgumentException("inputs must not be null"); + + return addEdge(edge, endpoints.getFirst(), endpoints.getSecond(), edgeType); + } + + @Override + @SuppressWarnings("unchecked") + public boolean addEdge(E edge, Collection vertices, EdgeType edge_type) { + if (edge == null || vertices == null) + throw new IllegalArgumentException("inputs must not be null"); + + if (vertices.size() != 2) + throw new IllegalArgumentException("'vertices' must contain " + + "exactly 2 distinct vertices"); + + this.validateEdgeType(edge_type); + + Pair endpoints; + + if (vertices instanceof Pair) + endpoints = (Pair) vertices; + else + endpoints = new Pair<>(vertices); + + V v1 = endpoints.getFirst(); + V v2 = endpoints.getSecond(); + + if (v1.equals(v2)) + throw new IllegalArgumentException("Input vertices must be distinct"); + + return addEdge(edge, v1, v2); + } + + @Override + public boolean addVertex(V vertex) throws UnsupportedOperationException { + if (root == null) { + this.root = vertex; + vertex_data.put(vertex, new VertexData(null, 0)); + this.height = 0; + return true; + } + + else { + throw new UnsupportedOperationException("Unless you are setting " + + "the root, use addEdge() or addChild()"); + } + } + + @Override + public V getDest(E directed_edge) { + if (!containsEdge(directed_edge)) + return null; + + return edge_vpairs.get(directed_edge).getSecond(); + } + + @Override + public Pair getEndpoints(E edge) { + if (!containsEdge(edge)) + return null; + + return edge_vpairs.get(edge); + } + + @Override + public Set getInEdges(V vertex) { + if (!containsVertex(vertex)) + return null; + else if (vertex.equals(root)) + return Collections.emptySet(); + else + return Collections.singleton(getParentEdge(vertex)); + } + + @Override + public V getOpposite(V vertex, E edge) { + if (!containsVertex(vertex) || !containsEdge(edge)) + return null; + + Pair endpoints = edge_vpairs.get(edge); + V v1 = endpoints.getFirst(); + V v2 = endpoints.getSecond(); + + return v1.equals(vertex) ? v2 : v1; + } + + @Override + public Set getOutEdges(V vertex) { + return getChildEdges(vertex); + } + + /** + * @return 0 if vertex is the root, -1 if the vertex is not an element of this + * tree, and 1 otherwise + */ + @Override + public int getPredecessorCount(V vertex) { + if (!containsVertex(vertex)) + return -1; + + return vertex.equals(root) ? 
0 : 1; + } + + /** + * @return Empty Set if the vertex is the root, null if the vertex is not an + * element of this tree, and the Parent wrapper in a set otherwise + */ + @Override + public Set getPredecessors(V vertex) { + if (!containsVertex(vertex)) + return null; + + if (vertex.equals(root)) + return Collections.emptySet(); + + return Collections.singleton(getParent(vertex)); + } + + @Override + public V getSource(E directed_edge) { + if (!containsEdge(directed_edge)) + return null; + + return edge_vpairs.get(directed_edge).getFirst(); + } + + @Override + public int getSuccessorCount(V vertex) { + return getChildCount(vertex); + } + + @Override + public Set getSuccessors(V vertex) { + return getChildren(vertex); + } + + @Override + public int inDegree(V vertex) { + if (!containsVertex(vertex)) + return 0; + + if (vertex.equals(root)) + return 0; + + return 1; + } + + @Override + public boolean isDest(V vertex, E edge) { + if (!containsEdge(edge) || !containsVertex(vertex)) + return false; + + return edge_vpairs.get(edge).getSecond().equals(vertex); + } + + /** + * Returns true if vertex is a leaf of this tree, i.e., if it has no + * children. + * + * @param vertex the vertex to be queried + */ + public boolean isLeaf(V vertex) { + if (!containsVertex(vertex)) + return false; + + return outDegree(vertex) == 0; + } + + /** + * Returns true iff v1 is the parent of v2. Note that if + * v2 is the root and v1 is null, this method returns + * true. + */ + @Override + public boolean isPredecessor(V v1, V v2) { + if (!containsVertex(v2)) + return false; + + return getParent(v2).equals(v1); + } + + /** + * Returns true if the vertex is the root of this tree + * + * @param vertex the vertex to be queried + */ + public boolean isRoot(V vertex) { + if (root == null) + return false; + + return root.equals(vertex); + } + + @Override + public boolean isSource(V vertex, E edge) { + if (!containsEdge(edge) || !containsVertex(vertex)) + return false; + + return edge_vpairs.get(edge).getFirst().equals(vertex); + } + + /** + * Returns true iff v1 is the child of v2. Note that if + * v2 is a leaf node and v1 is null, this method returns + * true. 
+ */ + @Override + public boolean isSuccessor(V v1, V v2) { + if (!containsVertex(v2)) + return false; + + if (containsVertex(v1)) + return getParent(v1).equals(v2); + + return isLeaf(v2) && v1 == null; + } + + @Override + public int outDegree(V vertex) { + if (!containsVertex(vertex)) + return 0; + + Set out_edges = vertex_data.get(vertex).child_edges; + + if (out_edges == null) + return 0; + + return out_edges.size(); + } + + @Override + public boolean isIncident(V vertex, E edge) { + if (!containsVertex(vertex) || !containsEdge(edge)) + return false; + + return edge_vpairs.get(edge).contains(vertex); + } + + @Override + public boolean isNeighbor(V v1, V v2) { + if (!containsVertex(v1) || !containsVertex(v2)) + return false; + + return getNeighbors(v1).contains(v2); + } + + @Override + public boolean containsEdge(E edge) { + return edge_vpairs.containsKey(edge); + } + + @Override + public boolean containsVertex(V vertex) { + return vertex_data.containsKey(vertex); + } + + @Override + public E findEdge(V v1, V v2) { + if (!containsVertex(v1) || !containsVertex(v2)) + return null; + + VertexData v1_data = vertex_data.get(v1); + + if (edge_vpairs.get(v1_data.parent_edge).getFirst().equals(v2)) + return v1_data.parent_edge; + + Set edges = v1_data.child_edges; + + if (edges == null) + return null; + + for (E edge : edges) + if (edge_vpairs.get(edge).getSecond().equals(v2)) + return edge; + + return null; + } + + @Override + public Set findEdgeSet(V v1, V v2) { + E edge = findEdge(v1, v2); + + if (edge == null) + return Collections.emptySet(); + else + return Collections.singleton(edge); + } + + @Override + public int getEdgeCount() { + return edge_vpairs.size(); + } + + @Override + public Set getEdges() { + return new ImmutableSet.Builder().addAll(edge_vpairs.keySet()).build(); + } + + @Override + public int getIncidentCount(E edge) { + return 2; // all tree edges have 2 incident vertices + } + + public Set getIncidentEdges(V vertex) { + if (!containsVertex(vertex)) + return null; + + Set edges = new HashSet<>(); + VertexData v_data = vertex_data.get(vertex); + + if (v_data.parent_edge != null) + edges.add(v_data.parent_edge); + + if (v_data.child_edges != null) { + edges.addAll(v_data.child_edges); + } + + if (edges.isEmpty()) + return Collections.emptySet(); + + return new ImmutableSet.Builder().addAll(edges).build(); + } + + @Override + public Collection getIncidentVertices(E edge) { + return edge_vpairs.get(edge); + } + + @Override + public int getNeighborCount(V vertex) { + if (!containsVertex(vertex)) + return 0; + + return (vertex.equals(root) ? 
0 : 1) + this.getChildCount(vertex); + } + + @Override + public Set getNeighbors(V vertex) { + if (!containsVertex(vertex)) + return null; + + Set vertices = new HashSet<>(); + VertexData v_data = vertex_data.get(vertex); + + if (v_data.parent_edge != null) + vertices.add(edge_vpairs.get(v_data.parent_edge).getFirst()); + + if (v_data.child_edges != null) { + for (E edge : v_data.child_edges) + vertices.add(edge_vpairs.get(edge).getSecond()); + } + + if (vertices.isEmpty()) + return Collections.emptySet(); + + return new ImmutableSet.Builder().addAll(vertices).build(); + } + + @Override + public int getVertexCount() { + return vertex_data.size(); + } + + @Override + public Set getVertices() { + return new ImmutableSet.Builder().addAll(vertex_data.keySet()).build(); + } + + @Override + public boolean removeEdge(E edge) { + if (!containsEdge(edge)) + return false; + + removeVertex(edge_vpairs.get(edge).getSecond()); + edge_vpairs.remove(edge); + + return true; + } + + @Override + public boolean removeVertex(V vertex) { + if (!containsVertex(vertex)) + return false; + + // recursively remove all of vertex's children + for (V v : getChildren(vertex)) + removeVertex(v); + + E parent_edge = getParentEdge(vertex); + edge_vpairs.remove(parent_edge); + + Set edges = vertex_data.get(vertex).child_edges; + + if (edges != null) + for (E edge : edges) + edge_vpairs.remove(edge); + + vertex_data.remove(vertex); + + return true; + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + if (!(o instanceof UnorderedTree)) + return false; + + UnorderedTree that = (UnorderedTree) o; + + if (this.edge_vpairs.size() != that.edge_vpairs.size()) + return false; + + for (E edge : this.edge_vpairs.keySet()) { + Pair thisPair = this.edge_vpairs.get(edge); + + if (!that.edge_vpairs.containsKey(edge)) + return false; + + Pair thatPair = that.edge_vpairs.get(edge); + + if (!thisPair.equals(thatPair)) + return false; + } + + if (this.vertex_data.size() != that.vertex_data.size()) + return false; + + for (V vertex : this.vertex_data.keySet()) { + VertexData thisData = this.vertex_data.get(vertex); + + if (!that.vertex_data.containsKey(vertex)) + return false; + + VertexData thatData = that.vertex_data.get(vertex); + + if (!thisData.equals(thatData)) + return false; + } + + if (this.height != that.height) + return false; + + if (!this.root.equals(that.root)) + return false; + + return true; + } + + protected class VertexData implements Serializable { + private static final long serialVersionUID = 1L; + + Set child_edges; + E parent_edge; + int depth; + + VertexData(E parent_edge, int depth) { + this.parent_edge = parent_edge; + this.depth = depth; + } + + public boolean equals(VertexData that) { + if (this.child_edges.size() != that.child_edges.size()) + return false; + + for (E edge : this.child_edges) { + if (!that.child_edges.contains(edge)) + return false; + } + + if (!this.parent_edge.equals(that.parent_edge)) + return false; + + if (this.depth != that.depth) + return false; + + return true; + } + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/representation/AEmbedding.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/representation/AEmbedding.java new file mode 100755 index 000000000..2a190be89 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/representation/AEmbedding.java @@ -0,0 +1,169 @@ +/** + * This software is released under the University 
of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.representation; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import edu.illinois.cs.cogcomp.datalessclassification.util.QueryPreProcessor; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVector; + +/** + * Abstract class for all Embeddings + * + * @author yqsong@illinois.edu + * @author shashank + */ + +public abstract class AEmbedding { + + /** + * Override this function to get your desired query segmentation (simpleSegmentation being one + * example) + */ + public List getTerms(String query) { + return simpleSegmentation(query); + } + + /** + * A simple white-space based tokenizer + */ + private List simpleSegmentation(String query) { + query = QueryPreProcessor.process(query); + String[] terms = query.split("\\s+"); + + List termList = Arrays.asList(terms); + + return termList; + } + + /** + * Override this function to get your desired term processing module (simpleProcessTerm being + * one example) + */ + public String processTerm(String s) { + return simpleProcessTerm(s); + } + + final private String simpleProcessTerm(String s) { + return s.toLowerCase().trim(); + } + + /** + * Implement this function for returning the default vector in case no word in the query could + * be found in the index + */ + abstract public SparseVector getDefaultConceptVectorMap(); + + /** + * Intended to output the vector for the most basic unit; Should internally process the term + * using the processTerm function + */ + abstract public SparseVector getTermConceptVectorMap(String term); + + /** + * Intended to output the vector for any arbitrary query; Should internally segment/tokenize the query, + * process it, and then return a composed vector. + * + */ + abstract public SparseVector getVector(String query); + + /** + * Override this function to select given number of dimensions + */ + public SparseVector getVector(String query, int numConcepts) { + return getVectorIgnoreSize(query); + } + + private SparseVector getVectorIgnoreSize(String query) { + return getVector(query); + } + + /** + * This function converts the given query into a list of terms using the getTerms() function, + * and returns a vector which is an average of the vectors of the individual terms. + */ + public SparseVector getConceptVectorBasedOnSegmentation(String query) { + return getConceptVectorBasedOnSegmentation(query, false); + } + + /** + * This function overloads getConceptVectorBasedOnSegmentation to provide support for + * switching on/off weighing the individual term vectors with their frequencies in the query. 
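+ * When weighting is on, each term's vector is scaled by that term's count in the query and the
+ * result is normalized by the sum of those counts (see getConceptVectorBasedOnTermWeights).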
+ * + * Setting ignoreTermFreq = True will return a simple averaging over all the terms in the query + */ + public SparseVector getConceptVectorBasedOnSegmentation(String query, boolean ignoreTermFreq) { + Map termWeights = new HashMap<>(); + List terms = getTerms(query); + + for (String term : terms) { + if (!termWeights.containsKey(term)) { + termWeights.put(term, 1.0); + } else { + if (!ignoreTermFreq) + termWeights.put(term, termWeights.get(term) + 1.0); + } + } + + return getConceptVectorBasedOnTermWeights(termWeights); + } + + /** + * This function takes a "Term - Count" map as input and outputs a vector which is the + * weighted average of the vectors of individual terms. + */ + public SparseVector getConceptVectorBasedOnTermWeights(Map termWeights) { + if (termWeights.size() == 0) + return new SparseVector<>(); + + Map finalMap = new HashMap<>(); + + double sumWeight = 0; + + for (String term : termWeights.keySet()) { + SparseVector vec = getTermConceptVectorMap(term); + + if (vec == null) + continue; + + if (vec.size() == 0) + continue; + + Map map = vec.getKeyValueMap(); + + for (T index : map.keySet()) { + double score = map.get(index); + + score *= termWeights.get(term); + + if (!finalMap.containsKey(index)) { + finalMap.put(index, score); + } else { + finalMap.put(index, finalMap.get(index) + score); + } + } + + sumWeight += termWeights.get(term); + } + + if (finalMap.isEmpty()) + return new SparseVector<>(); + + for (T k : finalMap.keySet()) { + finalMap.put(k, finalMap.get(k) / sumWeight); + } + + SparseVector sparseVector = new SparseVector<>(finalMap); + + return sparseVector; + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/representation/esa/MemoryBasedESA.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/representation/esa/MemoryBasedESA.java new file mode 100755 index 000000000..7679a9c36 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/representation/esa/MemoryBasedESA.java @@ -0,0 +1,446 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.representation.esa; + +import java.io.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator; +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.config.ESADatalessConfigurator; +import edu.illinois.cs.cogcomp.datalessclassification.representation.AEmbedding; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVector; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVectorOperations; +import org.cogcomp.Datastore; +import org.cogcomp.DatastoreException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Computes ESA Embedding for a query. + * Loads up all the required DataStructures in memory. 
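+ *
+ * A minimal usage sketch, mirroring the main() method of this class (the query text is arbitrary,
+ * and the concept keys are assumed to be Integer Wikipedia page IDs):
+ * <pre>{@code
+ * MemoryBasedESA esa = new MemoryBasedESA();        // default ESA dimension from ESADatalessConfigurator
+ * SparseVector<Integer> vector = esa.getVector("some document text");
+ * SparseVector<String> concepts = esa.retrieveConceptNames(vector);  // conceptID -> conceptName view
+ * }</pre>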
+ * + * @author yqsong@illinois.edu + * @author shashank + */ + +public class MemoryBasedESA extends AEmbedding { + private static Logger logger = LoggerFactory.getLogger(MemoryBasedESA.class); + + private static Map> vectors; + private static Map wordIDF; + private static Map pageIdTitleMapping; + + //TODO: Get default term constant from a config file + private static final String DEFAULT_TERM = "if"; + + private int dimensions; + + private enum FileType { + Embedding, Mapping + } + + public MemoryBasedESA() { + this(new ESADatalessConfigurator().getDefaultConfig()); + } + + public MemoryBasedESA(ResourceManager config) { + this(config.getInt(ESADatalessConfigurator.ESA_DIM)); + } + + public MemoryBasedESA(int embSize) { + dimensions = embSize; + } + + /** + * Loads up ESA embeddings lazily + */ + private void loadVectors() { + if (vectors == null) { + File inputFile = null; + + try { + inputFile = getFile(FileType.Embedding); + } catch (DatastoreException e) { + e.printStackTrace(); + logger.error("Error obtaining embeddings file from Datastore"); + throw new RuntimeException("Error obtaining embeddings file from Datastore"); + } + + try(BufferedReader reader = new BufferedReader(new FileReader(inputFile))) { + logger.info("Reading ESA Embeddings from " + inputFile.getAbsolutePath()); + + vectors = new HashMap<>(); + wordIDF = new HashMap<>(); + + int count = 0; + + String line; + + while ((line = reader.readLine()) != null) { + line = line.trim(); + + if (line.length() > 0) { + String[] arr = line.split("\t"); + + String word = arr[0]; + + double idf = Double.parseDouble(arr[1]); + wordIDF.put(word, idf); + + String[] conceptValues = arr[2].split(";"); + + Map map = new HashMap<>(); + + for (String conceptValue : conceptValues) { + String[] tokens = conceptValue.split(","); + map.put(Integer.parseInt(tokens[0]), Double.parseDouble(tokens[1])); + } + + SparseVector sparseVector = new SparseVector<>(map); + vectors.put(word, sparseVector); + } + + count++; + + if (count % 100000 == 0) + logger.info("#ESA embeddings read: " + count); + } + + logger.info("Done."); + } catch (FileNotFoundException e) { + e.printStackTrace(); + logger.error("ESA embedding file not found at " + inputFile); + throw new RuntimeException("ESA embedding file not found at " + inputFile); + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the ESA Embeddings"); + throw new RuntimeException("IO Error while reading the ESA Embeddings"); + } + } + } + + /** + * Loads up ESA's "ID - Title" mapping lazily + */ + private void loadIdTitleMap() { + if (pageIdTitleMapping == null) { + File mappingFile = null; + + try { + mappingFile = getFile(FileType.Mapping); + } catch (DatastoreException e) { + e.printStackTrace(); + logger.error("Error obtaining Name-mapping file from Datastore"); + throw new RuntimeException("Error obtaining Name-mapping file from Datastore"); + } + + try(BufferedReader bf = new BufferedReader(new FileReader(mappingFile))) { + logger.info("Reading mapping file: " + mappingFile.getAbsolutePath()); + + pageIdTitleMapping = new HashMap<>(); + + String line; + + while ((line = bf.readLine()) != null) { + if (line.length() == 0) + continue; + + String[] tokens = line.split("\t"); + + if (tokens.length != 2) + continue; + + Integer id = Integer.parseInt(tokens[0].trim()); + + if (!pageIdTitleMapping.containsKey(id)) { + pageIdTitleMapping.put(id, tokens[1]); + } + } + + logger.info("Done."); + } catch (FileNotFoundException e) { + e.printStackTrace(); + 
logger.error("Name-mapping file not found"); + throw new RuntimeException("Name-mapping file not found"); + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the name-mapping file"); + throw new RuntimeException("IO Error while reading the name-mapping file"); + } + } + } + + /** + * Retrieves the relevant file from the DataStore + */ + private File getFile(FileType type) throws DatastoreException { + ResourceManager rm = new ResourceConfigurator().getDefaultConfig(); + Datastore ds = new Datastore(rm.getString("datastoreEndpoint")); + + if (type.equals(FileType.Embedding)) { + File f = ds.getFile("org.cogcomp.dataless", "memorybasedESA.txt", 1.0); + return f; + } else { + File f = ds.getFile("org.cogcomp.dataless", "pageIDMapping.txt", 1.0); + return f; + } + } + + /** + * - Returns the vector where the conceptIDs are replaced with conceptNames + * - Allows for greater readibility of the vector + */ + public SparseVector retrieveConceptNames(String query) { + return retrieveConceptNames(query, dimensions); + } + + /** + * Overloads retrieveConceptNames to allow support for retrieving a fixed number of dimensions. + */ + public SparseVector retrieveConceptNames(String query, int numConcepts) { + SparseVector vectorTopic = getVector(query, numConcepts); + return retrieveConceptNames(vectorTopic); + } + + /** + * Converts the ConceptID vector to the corresponding ConceptName vector + */ + public SparseVector retrieveConceptNames(SparseVector originalVector) { + Map map = originalVector.getKeyValueMap(); + + SparseVector sparseVector = new SparseVector<>(); + Map outMap = new LinkedHashMap<>(); + + for (Integer key : map.keySet()) { + String concept = getConceptFromID(key); + + if (concept != null) + outMap.put(concept, map.get(key)); + } + + sparseVector.setVector(outMap); + return sparseVector; + } + + @Override + public SparseVector getVector(String query) { + return getVector(query, dimensions); + } + + @Override + public SparseVector getVector(String query, int numConcepts) { + return getConceptVectorBasedOnSegmentation(query, numConcepts); + } + + @Override + public SparseVector getConceptVectorBasedOnSegmentation(String query) { + return getConceptVectorBasedOnSegmentation(query, dimensions); + } + + /** + * This function overloads getConceptVectorBasedOnSegmentation to provide support for + * limiting the number of dimensions + */ + public SparseVector getConceptVectorBasedOnSegmentation(String query, int numConcepts) { + loadVectors(); + + Map tfidfMap = new HashMap<>(); + List terms = getTerms(query); + + if (terms.size() == 0) + return new SparseVector<>(); + + for (String term : terms) { + if (!tfidfMap.containsKey(term)) { + tfidfMap.put(term, 1.0); + } else { + tfidfMap.put(term, tfidfMap.get(term) + 1); + } + } + + double vsum = 0; + double norm; + + for (String strTerm : tfidfMap.keySet()) { + double tf = tfidfMap.get(strTerm); + + tf = 1 + Math.log(tf); + + if (wordIDF.containsKey(strTerm)) { + double tfidf = wordIDF.get(strTerm) * tf; + + vsum += tfidf * tfidf; + + tfidfMap.put(strTerm, tfidf); + } + } + + norm = Math.sqrt(vsum); + + for (String strTerm : tfidfMap.keySet()) { + double tfidf = tfidfMap.get(strTerm); + tfidfMap.put(strTerm, tfidf / norm); + } + + return getConceptVectorBasedonTermWeights(tfidfMap, numConcepts); + } + + @Override + public SparseVector getConceptVectorBasedOnTermWeights(Map termWeights) { + return getConceptVectorBasedonTermWeights(termWeights, dimensions); + } + + /** + * This function overloads 
getConceptVectorBasedOnTermWeights to provide support for + * limiting the number of dimensions + */ + public SparseVector getConceptVectorBasedonTermWeights( + Map termWeights, int numConcepts) { + if (termWeights.size() == 0) + return new SparseVector<>(); + + List> conceptMapList = new ArrayList<>(); + List weightList = new ArrayList<>(); + + for (String strTerm : termWeights.keySet()) { + SparseVector sparseVector = getTermConceptVectorMap(strTerm, numConcepts); + + if ((sparseVector.size() > 0) && (termWeights.get(strTerm) > 0)) { + conceptMapList.add(sparseVector.getKeyValueMap()); + weightList.add(termWeights.get(strTerm)); + } + } + + Map conceptMap; + + // TODO: No normalization by the sum of the weights? + + conceptMap = SparseVectorOperations.addMultipleMaps(conceptMapList, weightList); + + SparseVector vec = new SparseVector<>(conceptMap); + + SparseVector sortedVec = + SparseVector.getOrderedSparseVector(vec, SparseVector.decreasingScores(), + numConcepts); + + // Normalization by the length of the document/terms + sortedVec.scaleAll(1.0 / weightList.size()); + + return sortedVec; + } + + + @Override + public SparseVector getDefaultConceptVectorMap() { + return getDefaultConceptVectorMap(dimensions); + } + + /** + * This function overloads getDefaultConceptVectorMap to provide support for + * limiting the number of dimensions + */ + public SparseVector getDefaultConceptVectorMap(int numConcepts) { + loadVectors(); + + return getTermConceptVectorMap(DEFAULT_TERM, numConcepts); + } + + @Override + public SparseVector getTermConceptVectorMap(String term) { + return getTermConceptVectorMap(term, dimensions); + } + + /** + * This function overloads getTermConceptVectorMap to provide support for + * limiting the number of dimensions + */ + public SparseVector getTermConceptVectorMap(String term, int numConcepts) { + loadVectors(); + + SparseVector vector = new SparseVector<>(); + + term = processTerm(term); + + if (vectors.containsKey(term)) + vector = vectors.get(term); + + SparseVector sortedVec = + SparseVector.getOrderedSparseVector(vector, + SparseVector.decreasingScores(), numConcepts); + + return sortedVec; + } + + /** + * Returns the ConceptName from the ConceptID + */ + public String getConceptFromID(Integer id) { + if (pageIdTitleMapping == null) + loadIdTitleMap(); + + String conceptName = null; + + if (pageIdTitleMapping.containsKey(id)) + conceptName = + pageIdTitleMapping.get(id).replaceAll(",", "").replaceAll(";", "") + .replaceAll("\t", ""); + + return conceptName; + } + + + public static void main(String[] args) { + String sampleFile = "sampleDocument.txt"; + + if (args.length > 0) { + sampleFile = args[0]; + } + + try(BufferedReader br = new BufferedReader(new FileReader(new File(sampleFile)))) { + StringBuilder sb = new StringBuilder(); + + String line; + + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append(" "); + } + + String text = sb.toString().trim(); + + MemoryBasedESA esa = new MemoryBasedESA(); + + SparseVector vector = esa.getVector(text); + Map vectorMap = vector.getKeyValueMap(); + + for (Integer key : vectorMap.keySet()) + System.out.print(key + "," + vectorMap.get(key) + ";"); + + System.out.println(); + System.out.println("Corresponding Concepts:"); + + SparseVector vectorTopic = esa.retrieveConceptNames(vector); + Map vectorTopicMap = vectorTopic.getKeyValueMap(); + + for (String key : vectorTopicMap.keySet()) + System.out.print(key + "," + vectorTopicMap.get(key) + ";"); + + System.out.println(); + + } catch (IOException e) { + 
e.printStackTrace(); + logger.error("IO Error while reading the test file"); + throw new RuntimeException("IO Error while reading the test file"); + } + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/representation/w2v/MemoryBasedW2V.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/representation/w2v/MemoryBasedW2V.java new file mode 100755 index 000000000..3ee0ce08c --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/representation/w2v/MemoryBasedW2V.java @@ -0,0 +1,265 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.representation.w2v; + +import java.io.*; +import java.util.HashMap; +import java.util.Map; + +import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator; +import org.cogcomp.Datastore; +import org.cogcomp.DatastoreException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.config.W2VDatalessConfigurator; +import edu.illinois.cs.cogcomp.datalessclassification.representation.AEmbedding; +import edu.illinois.cs.cogcomp.datalessclassification.util.DenseVector; +import edu.illinois.cs.cogcomp.datalessclassification.util.DenseVectorOperations; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVector; + +/** + * Computes Word2Vec Embedding for a query + * Loads up all the required DataStructures in memory + * + * @author yqsong@illinois.edu + * @author shashank + */ + +public class MemoryBasedW2V extends AEmbedding { + private static Logger logger = LoggerFactory.getLogger(MemoryBasedW2V.class); + + private static Map vectors; + + private int dimensions; + + //TODO: Get default term constant from a config file + private static final String DEFAULT_TERM = "auto"; + + public MemoryBasedW2V() { + this(new W2VDatalessConfigurator().getDefaultConfig()); + } + + public MemoryBasedW2V(ResourceManager config) { + this(config.getInt(W2VDatalessConfigurator.W2V_DIM)); + } + + public MemoryBasedW2V(int embSize) { + dimensions = embSize; + } + + /** + * Loads up Word2Vec embeddings lazily + */ + private void loadVectors() { + if (vectors == null) { + File inputFile = null; + try { + inputFile = getFile(); + } catch (DatastoreException e) { + e.printStackTrace(); + logger.error("Error retrieving the embedding file from DataStore"); + throw new RuntimeException("Error retrieving the embedding file from DataStore"); + } + + try(BufferedReader bf = new BufferedReader(new FileReader(inputFile))) { + logger.info("Reading Word2vec Embeddings from " + inputFile.getAbsolutePath()); + vectors = new HashMap<>(); + + String line = bf.readLine(); + String[] tokens = line.split(" "); + + // The first line has the following schema --> #Terms #Vector_Dimensions + int dimNum = Integer.parseInt(tokens[1].trim()); + + if (dimNum != dimensions) { + bf.close(); + throw new IllegalStateException("Number of dimensions in the embeddings file (" + dimNum + + ") don't match the one in the config file (" + dimensions + ")"); + } + + int count = 0; + + while ((line = bf.readLine()) != null) { + 
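+ // Each remaining line is expected to follow the schema "<word> <dim_1> <dim_2> ... <dim_n>" (space-separated);
+ // its dimension count must match the header value (dimNum) validated above.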
line = line.trim(); + + if (line.length() == 0) + continue; + + tokens = line.trim().split(" ", 2); + String[] stringVec = tokens[1].split(" "); + + if (stringVec.length != dimNum) { + bf.close(); + throw new IllegalStateException( + "Possible Error in the embeddings file -- number of dimensions(" + + dimNum + ") don't match -->" + tokens[1]); + } + + String word = tokens[0].trim(); + if (word.length() == 0) + continue; + + double[] scores = new double[dimNum]; + + int i = 0; + for (String dim : stringVec) { + scores[i] = Double.parseDouble(dim); + i++; + } + + DenseVector vec = new DenseVector(scores); + vectors.put(word, vec); + + count++; + + if (count % 100000 == 0) + logger.info("#W2V embeddings read: " + count); + + } + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the W2V Embedding File"); + throw new RuntimeException("IO Error while reading the W2V Embedding File"); + } catch (IllegalStateException e) { + e.printStackTrace(); + logger.error(e.getMessage()); + throw new RuntimeException(e.getMessage()); + } + } + } + + /** + * Retrieves the relevant file from the DataStore + */ + private File getFile() throws DatastoreException { + ResourceManager rm = new ResourceConfigurator().getDefaultConfig(); + Datastore ds = new Datastore(rm.getString("datastoreEndpoint")); + + File f = ds.getFile("org.cogcomp.dataless", "word2vec.txt", 1.0); + return f; + } + + @Override + public SparseVector getTermConceptVectorMap(String term) { + loadVectors(); + + SparseVector vector = new SparseVector<>(); + + term = processTerm(term); + + if (vectors.containsKey(term)) + vector = DenseVectorOperations.getSparseVector(vectors.get(term));; + + return vector; + } + + @Override + public SparseVector getDefaultConceptVectorMap() { + loadVectors(); + + return getTermConceptVectorMap(DEFAULT_TERM); + } + + /** + * Returns a DenseVector for the SparseVector obtained from getDefaultConceptVectorMap + */ + public DenseVector getDefaultDenseTermVector() { + SparseVector conceptMap = getDefaultConceptVectorMap(); + DenseVector vec = DenseVector.createDenseVector(conceptMap); + + return vec; + } + + /** + * Returns a DenseVector for the SparseVector obtained from getTermConceptVectorMap + */ + public DenseVector getDenseTermVector(String term) { + SparseVector conceptMap = getTermConceptVectorMap(term); + DenseVector vec = DenseVector.createDenseVector(conceptMap); + + return vec; + } + + @Override + public SparseVector getVector(String query) { + return getConceptVectorBasedOnSegmentation(query); + } + + /** + * Returns a DenseVector for the SparseVector obtained from getConceptVectorBasedOnSegmentation + */ + public DenseVector getDenseVectorBasedOnSegmentation(String query) { + return getDenseVectorBasedOnSegmentation(query, false); + } + + /** + * Overloads getDenseVectorBasedOnSegmentation to provide support for switching on/off + * term frequency weighting while composing the term vectors + */ + public DenseVector getDenseVectorBasedOnSegmentation(String query, boolean ignoreTermFreq) { + SparseVector conceptMap = + getConceptVectorBasedOnSegmentation(query, ignoreTermFreq); + DenseVector vec = DenseVector.createDenseVector(conceptMap); + + return vec; + } + + /** + * Returns a DenseVector for the SparseVector obtained from getConceptVectorBasedOnTermWeights + */ + public DenseVector getDenseVectorBasedOnTermWeights(HashMap termWeights) { + SparseVector conceptMap = getConceptVectorBasedOnTermWeights(termWeights); + DenseVector vec = 
DenseVector.createDenseVector(conceptMap); + + return vec; + } + + public static void main(String[] args) { + String sampleFile = "sampleDocument.txt"; + + if (args.length > 0) { + sampleFile = args[0]; + } + + try(BufferedReader br = new BufferedReader(new FileReader(new File(sampleFile)))) { + StringBuilder sb = new StringBuilder(); + + String line; + + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append(" "); + } + + br.close(); + + String text = sb.toString().trim(); + + MemoryBasedW2V embedding = new MemoryBasedW2V(); + + SparseVector vector = embedding.getVector(text); + Map vectorMap = vector.getKeyValueMap(); + + for (Integer key : vectorMap.keySet()) + System.out.print(key + "," + vectorMap.get(key) + ";"); + + System.out.println(); + + } catch (FileNotFoundException e) { + e.printStackTrace(); + logger.error("Test File not found at " + sampleFile); + throw new RuntimeException("Test File not found at " + sampleFile); + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the test file"); + throw new RuntimeException("IO Error while reading the test file"); + } + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/ADatalessAnnotator.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/ADatalessAnnotator.java new file mode 100755 index 000000000..1ed70d5a0 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/ADatalessAnnotator.java @@ -0,0 +1,210 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.ta; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import edu.illinois.cs.cogcomp.annotation.Annotator; +import org.apache.commons.lang.NotImplementedException; +import org.json.simple.JSONObject; + +import edu.illinois.cs.cogcomp.annotation.AnnotatorException; +import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.classifier.ConceptTree; +import edu.illinois.cs.cogcomp.datalessclassification.classifier.DatalessClassifierML; +import edu.illinois.cs.cogcomp.datalessclassification.classifier.LabelTree; +import edu.illinois.cs.cogcomp.datalessclassification.config.DatalessConfigurator; +import edu.illinois.cs.cogcomp.datalessclassification.representation.AEmbedding; +import edu.illinois.cs.cogcomp.datalessclassification.util.SparseVector; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Abstract class for all Dataless Annotators + * + * @author shashank + */ + +public abstract class ADatalessAnnotator extends Annotator { + private static Logger logger = LoggerFactory.getLogger(ADatalessAnnotator.class); + + protected int embedding_dim; + protected int topK; + + protected AEmbedding embedding; + protected DatalessClassifierML classifier; + + protected 
LabelTree labelTree; + protected ConceptTree conceptTree; + + protected Map conceptWeights; + + /** + * Use this constructor with "isLazilyInitialized = True" to use your custom annotator + * initializer + */ + protected ADatalessAnnotator(String viewName, boolean isLazilyInitialized) { + super(viewName, new String[] {ViewNames.TOKENS}, isLazilyInitialized); + } + + protected ADatalessAnnotator(String viewName, ResourceManager config) { + this(viewName, new String[] {ViewNames.TOKENS}, config); + } + + protected ADatalessAnnotator(String viewName, String[] requiredViews, ResourceManager config) { + this(viewName, requiredViews, false, config); + } + + protected ADatalessAnnotator(String viewName, String[] requiredViews, + boolean isLazilyInitialized, ResourceManager config) { + super(viewName, requiredViews, isLazilyInitialized, config); + } + + protected abstract String getClassName(); + + @Override + public void initialize(ResourceManager rm) { + String hierarchyFile = rm.getString(DatalessConfigurator.LabelHierarchy_Path); + String labelNameFile = rm.getString(DatalessConfigurator.LabelName_Path); + String labelDescFile = rm.getString(DatalessConfigurator.LabelDesc_Path); + + logger.info("Initializing LabelTree..."); + initializeLabelTree(hierarchyFile, labelNameFile, labelDescFile); + logger.info("LabelTree Initialization Done."); + + logger.info("Initializing Embedding..."); + initializeEmbedding(rm); + logger.info("Embedding Initialization Done."); + + logger.info("Initializing Classifier..."); + initializeClassifier(rm); + logger.info("Classifier Initialization Done."); + } + + /** + * Initializes the LabelTree from the JSON representation of the Label Hierarchy + * NOTE: This support is yet to come + * + * @throws NotImplementedException + */ + protected void initializeLabelTree(JSONObject jsonHierarchy) throws NotImplementedException { + // TODO: Start supporting JSON format for hierarchy input + throw new NotImplementedException("JSON support coming soon.."); + } + + /** + * Initializes the LabelTree from the mapping files + */ + protected void initializeLabelTree(String hierarchyFile, String labelNameFile, + String labelDescFile) { + Set topNodes = DatalessAnnotatorUtils.getTopNodes(hierarchyFile); + Map> childMap = DatalessAnnotatorUtils.getParentChildMap(hierarchyFile); + Map labelNameMap = DatalessAnnotatorUtils.getLabelNameMap(labelNameFile); + Map labelDescMap = + DatalessAnnotatorUtils.getLabelDescriptionMap(labelDescFile); + + initializeLabelTree(topNodes, childMap, labelNameMap, labelDescMap); + } + + /** + * Initializes the LabelTree structure, given the: + * @param topNodes: Set containing the labelIDs of the top-level nodes in the tree + * @param childMap: Map containing the parentID - childIds mapping + * @param labelNameMap: Map containing the labelID - labelName mapping + * @param labelDescMap: Map containing the labelID - labelDescription mapping + */ + protected void initializeLabelTree(Set topNodes, Map> childMap, + Map labelNameMap, Map labelDescMap) { + + labelTree = new LabelTree(); + + initializeTreeStructure(topNodes, childMap); + initializeLabelNames(labelNameMap); + initializeLabelDescriptions(labelDescMap); + } + + /** + * Initializes the LabelTree structure given the: + * @param + */ + private void initializeTreeStructure(Set topNodes, Map> childMap) { + labelTree.initializeTreeStructure(topNodes, childMap); + } + + /** + * Initializes the LabelNames + */ + private void initializeLabelNames(Map labelNameMap) { + 
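+ // Thin delegation to LabelTree; assumes initializeTreeStructure() has already populated the tree nodes.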
labelTree.initializeLabelNames(labelNameMap); + } + + /** + * Initializes the LabelDescriptions + */ + private void initializeLabelDescriptions(Map labelDesriptionMap) { + labelTree.initializeLabelDescriptions(labelDesriptionMap); + } + + /** + * - initialize the embedding, embedding_dim and (optionally) conceptWeights objects here + * - call this before calling initializeClassifier() + */ + protected abstract void initializeEmbedding(ResourceManager config); + + /** + * - Call this before trying to annotate the objects + * - Call this only after calling initializeEmbedding + */ + protected void initializeClassifier(ResourceManager config) { + initializeConceptTree(); + topK = config.getInt(DatalessConfigurator.topK); + classifier = new DatalessClassifierML<>(config, conceptTree); + } + + /** + * Initializes the ConceptTree + */ + protected void initializeConceptTree() { + conceptTree = new ConceptTree<>(labelTree, embedding, conceptWeights, embedding_dim); + } + + @Override + protected void addView(TextAnnotation ta) throws AnnotatorException { + SpanLabelView datalessView = new SpanLabelView(getViewName(), getClassName(), ta, 1d, true); + + List tokens = ta.getView(ViewNames.TOKENS).getConstituents(); + + int numTokens = tokens.size(); + + int textStart = tokens.get(0).getSpan().getFirst(); + int textEnd = tokens.get(numTokens - 1).getSpan().getSecond(); + + StringBuilder sb = new StringBuilder(); + + for (String s : ta.getTokensInSpan(textStart, textEnd)) { + sb.append(s); + sb.append(" "); + } + + SparseVector docVector = embedding.getVector(sb.toString().trim()); + + Set labelIDs = classifier.getFlatPredictions(docVector, topK); + + for (String labelID : labelIDs) { + datalessView.addSpanLabel(textStart, textEnd, labelID, 1d); + } + + ta.addView(getViewName(), datalessView); + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/DatalessAnnotatorUtils.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/DatalessAnnotatorUtils.java new file mode 100644 index 000000000..2a672b4a8 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/DatalessAnnotatorUtils.java @@ -0,0 +1,148 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.ta; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.*; + +/** + * Collection of utility functions + * + * @author yqsong@illinois.edu + * @author shashank + */ + +class DatalessAnnotatorUtils { + + private static Logger logger = LoggerFactory.getLogger(DatalessAnnotatorUtils.class); + + /** + * Reads the "labelID - labelName" mapping from the file + */ + static Map getLabelNameMap(String filePath) { + Map labelNameMap = new HashMap<>(); + + try(BufferedReader reader = new BufferedReader(new FileReader(filePath))) { + String line; + + while ((line = reader.readLine()) != null) { + if (line.isEmpty()) + continue; + + String[] tokens = line.split("\t", 2); + + String labelId = tokens[0].trim(); + String labelName = tokens[1].trim(); + + labelNameMap.put(labelId, labelName); + } + + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the file at " + filePath); + throw new RuntimeException("IO Error while reading the file at " + filePath); + } + + return labelNameMap; + } + + /** + * Reads the "labelID - labelDescription" mapping from the file + */ + static Map getLabelDescriptionMap(String filePath) { + Map labelDesriptionMap = new HashMap<>(); + + try(BufferedReader reader = new BufferedReader(new FileReader(filePath))) { + String line; + + while ((line = reader.readLine()) != null) { + if (line.isEmpty()) + continue; + + String[] tokens = line.split("\t", 2); + + String labelId = tokens[0].trim(); + String labelDesc = tokens[1].trim(); + + labelDesriptionMap.put(labelId, labelDesc); + } + + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the file at " + filePath); + throw new RuntimeException("IO Error while reading the file at " + filePath); + } + + return labelDesriptionMap; + } + + /** + * Reads the top-level nodes from the hierarchy file + */ + static Set getTopNodes(String hierarchyPath) { + Set topNodes = new HashSet<>(); + + try(BufferedReader reader = new BufferedReader(new FileReader(hierarchyPath))) { + String line = reader.readLine(); + + String[] nodes = line.split("\t"); + + topNodes.addAll(Arrays.asList(nodes)); + + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the file at " + hierarchyPath); + throw new RuntimeException("IO Error while reading the file at " + hierarchyPath); + } + + return topNodes; + } + + /** + * Reads the "parentNode - childNodes" mapping from the hierarchy file + */ + static Map> getParentChildMap(String hierarchyPath) { + Map> parentChildMap = new HashMap<>(); + + try(BufferedReader reader = new BufferedReader(new FileReader(hierarchyPath))) { + reader.readLine(); + + String line; + String[] nodes; + + while ((line = reader.readLine()) != null) { + if (line.isEmpty()) + continue; + + String[] tokens = line.split("\t", 2); + + String parentID = tokens[0].trim(); + String childIDs = tokens[1].trim(); + Set childIDSet = new HashSet<>(); + + nodes = childIDs.split("\t"); + + childIDSet.addAll(Arrays.asList(nodes)); + + parentChildMap.put(parentID, childIDSet); + } + + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the file at " + hierarchyPath); + throw new 
RuntimeException("IO Error while reading the file at " + hierarchyPath); + } + + return parentChildMap; + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/ESADatalessAnnotator.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/ESADatalessAnnotator.java new file mode 100644 index 000000000..dc7cc88bd --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/ESADatalessAnnotator.java @@ -0,0 +1,225 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.ta; + +import java.io.*; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.lang.NotImplementedException; +import org.json.simple.JSONObject; + +import edu.illinois.cs.cogcomp.annotation.AnnotatorException; +import edu.illinois.cs.cogcomp.datalessclassification.config.DatalessConfigurator; +import edu.illinois.cs.cogcomp.datalessclassification.config.ESADatalessConfigurator; +import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.representation.esa.MemoryBasedESA; +import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer; +import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A wrapper of ESA-based Dataless Classifier for the cogcomp pipeline. 
+ * + * @author shashank + */ + +public class ESADatalessAnnotator extends ADatalessAnnotator { + private static final String NAME = ESADatalessAnnotator.class.getCanonicalName(); + private static Logger logger = LoggerFactory.getLogger(ESADatalessAnnotator.class); + + public ESADatalessAnnotator() { + this(new ESADatalessConfigurator().getDefaultConfig()); + } + + public ESADatalessAnnotator(ResourceManager config) { + super(ViewNames.DATALESS_ESA, config); + } + + public ESADatalessAnnotator(ResourceManager config, JSONObject jsonHierarchy) + throws NotImplementedException { + super(ViewNames.DATALESS_ESA, true); + logger.info("Initializing LabelTree..."); + initializeLabelTree(jsonHierarchy); + logger.info("LabelTree Initialization Done."); + + logger.info("Initializing Embedding..."); + initializeEmbedding(config); + logger.info("Embedding Initialization Done."); + + logger.info("Initializing Classifier..."); + initializeClassifier(config); + logger.info("Classifier Initialization Done."); + + isInitialized = true; + } + + public ESADatalessAnnotator(ResourceManager config, String hierarchyPath, String labelNameFile, + String labelDescFile) { + super(ViewNames.DATALESS_ESA, true); + logger.info("Initializing LabelTree..."); + initializeLabelTree(hierarchyPath, labelNameFile, labelDescFile); + logger.info("LabelTree Initialization Done."); + + logger.info("Initializing Embedding..."); + initializeEmbedding(config); + logger.info("Embedding Initialization Done."); + + logger.info("Initializing Classifier..."); + initializeClassifier(config); + logger.info("Classifier Initialization Done."); + + isInitialized = true; + } + + public ESADatalessAnnotator(ResourceManager config, Set topNodes, + Map> childMap, Map labelNameMap, + Map labelDescMap) { + super(ViewNames.DATALESS_ESA, true); + logger.info("Initializing LabelTree..."); + initializeLabelTree(topNodes, childMap, labelNameMap, labelDescMap); + logger.info("LabelTree Initialization Done."); + + logger.info("Initializing Embedding..."); + initializeEmbedding(config); + logger.info("Embedding Initialization Done."); + + logger.info("Initializing Classifier..."); + initializeClassifier(config); + logger.info("Classifier Initialization Done."); + + isInitialized = true; + } + + /** + * Initializes the ESA Embedding that will be used for computing the representations + */ + protected void initializeEmbedding(ResourceManager config) { + conceptWeights = new HashMap<>(); + embedding_dim = config.getInt(ESADatalessConfigurator.ESA_DIM); + embedding = new MemoryBasedESA(config); + } + + @Override + protected String getClassName() { + return NAME; + } + + public static CommandLine getCMDOpts(String[] args) { + Options options = new Options(); + + Option configOpt = new Option("c", "config", true, "config file path"); + configOpt.setRequired(false); + options.addOption(configOpt); + + Option testFileOption = + new Option("f", "testFile", true, "File to annotate using Dataless"); + testFileOption.setRequired(false); + options.addOption(testFileOption); + + CommandLineParser parser = new DefaultParser(); + HelpFormatter formatter = new HelpFormatter(); + + CommandLine cmd = null; + + try { + cmd = parser.parse(options, args); + } catch (ParseException e) { + System.out.println(e.getMessage()); + formatter.printHelp("utility-name", options); + + System.exit(1); + return cmd; + } + + return cmd; + } + + + /** + * @param args config: config file path testFile: Test File + */ + public static void main(String[] args) { + CommandLine cmd = 
getCMDOpts(args); + + ResourceManager rm; + + try { + String configFile = cmd.getOptionValue("config", "config/project.properties"); + ResourceManager nonDefaultRm = new ResourceManager(configFile); + + rm = new ESADatalessConfigurator().getConfig(nonDefaultRm); + } catch (IOException e) { + rm = new ESADatalessConfigurator().getDefaultConfig(); + } + + String testFile = cmd.getOptionValue("testFile", "data/graphicsTestDocument.txt"); + + StringBuilder sb = new StringBuilder(); + + String line; + + try(BufferedReader br = new BufferedReader(new FileReader(new File(testFile)))) { + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append(" "); + } + + String text = sb.toString().trim(); + + TokenizerTextAnnotationBuilder taBuilder = + new TokenizerTextAnnotationBuilder(new StatefulTokenizer()); + TextAnnotation ta = taBuilder.createTextAnnotation(text); + + ESADatalessAnnotator datalessAnnotator = new ESADatalessAnnotator(rm); + datalessAnnotator.addView(ta); + + List annots = ta.getView(ViewNames.DATALESS_ESA).getConstituents(); + + System.out.println("Predicted LabelIDs:"); + for (Constituent annot : annots) { + System.out.println(annot.getLabel()); + } + + Map labelNameMap = + DatalessAnnotatorUtils.getLabelNameMap(rm + .getString(DatalessConfigurator.LabelName_Path.key)); + + System.out.println("Predicted Labels:"); + + for (Constituent annot : annots) { + System.out.println(labelNameMap.get(annot.getLabel())); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + logger.error("Test File not found at " + testFile + " ... exiting"); + System.exit(-1); + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the test file ... exiting"); + System.exit(-1); + } catch (AnnotatorException e) { + e.printStackTrace(); + logger.error("Error Annotating the Test Document with the Dataless View ... exiting"); + System.exit(-1); + } + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/W2VDatalessAnnotator.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/W2VDatalessAnnotator.java new file mode 100644 index 000000000..f2ea3e575 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/ta/W2VDatalessAnnotator.java @@ -0,0 +1,189 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.ta; + +import java.io.*; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.lang.NotImplementedException; +import org.json.simple.JSONObject; + +import edu.illinois.cs.cogcomp.annotation.AnnotatorException; +import edu.illinois.cs.cogcomp.datalessclassification.config.DatalessConfigurator; +import edu.illinois.cs.cogcomp.datalessclassification.config.W2VDatalessConfigurator; +import edu.illinois.cs.cogcomp.core.datastructures.ViewNames; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.representation.w2v.MemoryBasedW2V; +import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer; +import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A wrapper of Word2Vec-based Dataless Classifier for the cogcomp pipeline. + * + * @author shashank + */ + +public class W2VDatalessAnnotator extends ADatalessAnnotator { + private static String NAME = W2VDatalessAnnotator.class.getCanonicalName(); + private static Logger logger = LoggerFactory.getLogger(W2VDatalessAnnotator.class); + + public W2VDatalessAnnotator() { + this(new W2VDatalessConfigurator().getDefaultConfig()); + } + + public W2VDatalessAnnotator(ResourceManager config) { + super(ViewNames.DATALESS_W2V, config); + } + + public W2VDatalessAnnotator(ResourceManager config, JSONObject jsonHierarchy) + throws NotImplementedException { + super(ViewNames.DATALESS_W2V, true); + logger.info("Initializing LabelTree..."); + initializeLabelTree(jsonHierarchy); + logger.info("LabelTree Initialization Done."); + + logger.info("Initializing Embedding..."); + initializeEmbedding(config); + logger.info("Embedding Initialization Done."); + + logger.info("Initializing Classifier..."); + initializeClassifier(config); + logger.info("Classifier Initialization Done."); + + isInitialized = true; + } + + public W2VDatalessAnnotator(ResourceManager config, String hierarchyPath, String labelNameFile, + String labelDescFile) { + super(ViewNames.DATALESS_W2V, true); + logger.info("Initializing LabelTree..."); + initializeLabelTree(hierarchyPath, labelNameFile, labelDescFile); + logger.info("LabelTree Initialization Done."); + + logger.info("Initializing Embedding..."); + initializeEmbedding(config); + logger.info("Embedding Initialization Done."); + + logger.info("Initializing Classifier..."); + initializeClassifier(config); + logger.info("Classifier Initialization Done."); + + isInitialized = true; + } + + public W2VDatalessAnnotator(ResourceManager config, Set topNodes, + Map> childMap, Map labelNameMap, + Map labelDescMap) { + super(ViewNames.DATALESS_W2V, true); + logger.info("Initializing LabelTree..."); + initializeLabelTree(topNodes, childMap, labelNameMap, labelDescMap); + logger.info("LabelTree Initialization Done."); + + logger.info("Initializing Embedding..."); + initializeEmbedding(config); + logger.info("Embedding Initialization Done."); + + logger.info("Initializing Classifier..."); + 
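+ // initializeClassifier() builds the ConceptTree from the embedding, so it must run after initializeEmbedding().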
initializeClassifier(config); + logger.info("Classifier Initialization Done."); + + isInitialized = true; + } + + @Override + protected String getClassName() { + return NAME; + } + + /** + * Initializes the Word2Vec Embedding that will be used for computing the representations + */ + protected void initializeEmbedding(ResourceManager config) { + conceptWeights = new HashMap<>(); + embedding_dim = config.getInt(W2VDatalessConfigurator.W2V_DIM); + embedding = new MemoryBasedW2V(config); + } + + /** + * @param args config: config file path testFile: Test File + */ + public static void main(String[] args) { + CommandLine cmd = ESADatalessAnnotator.getCMDOpts(args); + + ResourceManager rm; + + try { + String configFile = cmd.getOptionValue("config", "config/project.properties"); + ResourceManager nonDefaultRm = new ResourceManager(configFile); + + rm = new W2VDatalessConfigurator().getConfig(nonDefaultRm); + } catch (IOException e) { + rm = new W2VDatalessConfigurator().getDefaultConfig(); + } + + String testFile = cmd.getOptionValue("testFile", "data/graphicsTestDocument.txt"); + + StringBuilder sb = new StringBuilder(); + + String line; + + try(BufferedReader br = new BufferedReader(new FileReader(new File(testFile)))) { + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append(" "); + } + + String text = sb.toString().trim(); + + TokenizerTextAnnotationBuilder taBuilder = + new TokenizerTextAnnotationBuilder(new StatefulTokenizer()); + TextAnnotation ta = taBuilder.createTextAnnotation(text); + + W2VDatalessAnnotator datalessAnnotator = new W2VDatalessAnnotator(rm); + datalessAnnotator.addView(ta); + + List annots = ta.getView(ViewNames.DATALESS_W2V).getConstituents(); + + System.out.println("Predicted LabelIDs:"); + + for (Constituent annot : annots) { + System.out.println(annot.getLabel()); + } + + Map labelNameMap = + DatalessAnnotatorUtils.getLabelNameMap(rm + .getString(DatalessConfigurator.LabelName_Path.key)); + + System.out.println("Predicted Labels:"); + + for (Constituent annot : annots) { + System.out.println(labelNameMap.get(annot.getLabel())); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + logger.error("Test File not found at " + testFile + " ... exiting"); + System.exit(-1); + } catch (AnnotatorException e) { + e.printStackTrace(); + logger.error("Error Annotating the Test Document with the Dataless View ... exiting"); + System.exit(-1); + } catch (IOException e) { + e.printStackTrace(); + logger.error("IO Error while reading the test file ... exiting"); + System.exit(-1); + } + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/DenseVector.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/DenseVector.java new file mode 100755 index 000000000..1180b26aa --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/DenseVector.java @@ -0,0 +1,116 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.util; + +import java.util.Map; + +/** + * A general purpose DenseVector implementation + * + * @author yqsong@illinois.edu + * @author sgupta96 + */ + +public class DenseVector { + private double[] elems; + + public DenseVector() { + elems = new double[0]; + } + + public DenseVector(int dims) { + elems = new double[dims]; + } + + public DenseVector(double[] scores) { + this.elems = scores; + } + + public void incrementAll(double value) { + for (int i = 0; i < elems.length; i++) { + elems[i] += value; + } + } + + public void increment(int index, double value) { + if (index < size()) + elems[index] += value; + } + + public void scaleAll(double value) { + for (int i = 0; i < elems.length; i++) { + elems[i] *= value; + } + } + + public void scale(int index, double value) { + if (index < size()) + elems[index] *= value; + } + + public double[] getVector() { + return this.elems; + } + + public int size() { + return elems.length; + } + + public double getElementAt(int index) { + if (index < size()) + return elems[index]; + else + throw new ArrayIndexOutOfBoundsException("Desired index exceeds the size of the vector"); + } + + public void setElementAt(int index, double value) { + if (index < size()) + elems[index] = value; + else + throw new ArrayIndexOutOfBoundsException("Desired index exceeds the size of the vector"); + } + + public static DenseVector createDenseVector(SparseVector sparseVector) { + DenseVector denseVector = new DenseVector(); + + if (sparseVector == null) + return denseVector; + + int max = Integer.MIN_VALUE; + + Map map = sparseVector.keyValueMap; + + for (Integer key : map.keySet()) { + if (key > max) + max = key; + } + + double[] finalVector = new double[map.size()]; + + for (Integer key : map.keySet()) { + finalVector[key] = map.get(key); + } + + denseVector = new DenseVector(finalVector); + + return denseVector; + } + + public String toString() { + StringBuilder str = new StringBuilder(""); + + for (int i = 0; i < elems.length; i++) { + str.append(i); + str.append(","); + str.append(elems[i]); + str.append(";"); + } + + return str.toString(); + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/DenseVectorOperations.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/DenseVectorOperations.java new file mode 100644 index 000000000..3fa787e3f --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/DenseVectorOperations.java @@ -0,0 +1,106 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.util; + +import java.util.HashMap; + +/** + * A Collection of useful functions for working with {@link DenseVector} + * + * @author shashank + */ + +public class DenseVectorOperations { + + public static double getNorm(double[] vector) { + double norm = 0; + + for (double dim : vector) { + norm += dim * dim; + } + + norm = Math.sqrt(norm); + + return norm; + } + + public static double getNorm(DenseVector vector) { + double norm = getNorm(vector.getVector()); + return norm; + } + + public static double cosine(double[] vec1, double[] vec2) { + if (vec1.length != vec2.length) + throw new IllegalArgumentException( + "Cosine only allowed for vectors of equal length. Lengths of Vectors --> Vector1 = " + + vec1.length + ", Vector2 = " + vec2.length); + + double norm1 = getNorm(vec1); + double norm2 = getNorm(vec2); + + double dot = 0; + + for (int i = 0; i < vec1.length; i++) { + dot += vec1[i] * vec2[i]; + } + + return dot / ((norm1 + Double.MIN_NORMAL) * (norm2 + Double.MIN_NORMAL)); + } + + public static double cosine(DenseVector v1, DenseVector v2) { + double[] vec1 = v1.getVector(); + double[] vec2 = v2.getVector(); + + return cosine(vec1, vec2); + } + + public static double[] add(double[] vec1, double[] vec2) { + if (vec1.length != vec2.length) + throw new IllegalArgumentException( + "Addition only allowed for vectors of equal length. Lengths of Vectors --> Vector1 = " + + vec1.length + ", Vector2 = " + vec2.length); + + int size = vec1.length; + double[] sum = new double[size]; + + for (int i = 0; i < size; i++) { + sum[i] = vec1[i] + vec2[i]; + } + + return sum; + } + + public static DenseVector add(DenseVector vec1, DenseVector vec2) { + if (vec1.size() != vec2.size()) + throw new IllegalArgumentException( + "Addition only allowed for vectors of equal length. Lengths of Vectors --> Vector1 = " + + vec1.size() + ", Vector2 = " + vec2.size()); + + double[] scores = add(vec1.getVector(), vec2.getVector()); + + DenseVector sum = new DenseVector(scores); + return sum; + } + + public static SparseVector getSparseVector(DenseVector denseVector) { + SparseVector sparseVector = new SparseVector<>(); + + if (denseVector == null) + return sparseVector; + + HashMap finalMap = new HashMap<>(); + + for (int dim = 0; dim < denseVector.size(); dim++) { + finalMap.put(dim, denseVector.getElementAt(dim)); + } + + sparseVector.setVector(finalMap); + + return sparseVector; + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/HashSort.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/HashSort.java new file mode 100755 index 000000000..3d58f186f --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/HashSort.java @@ -0,0 +1,37 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.util; + +import java.util.Comparator; +import java.util.Map; +import java.util.TreeMap; + +/** + * yqsong@illinois.edu + */ + +public class HashSort { + // the map is sorted from the highest value to the lowest. + public static > TreeMap sortByValues(final Map map) { + Comparator valueComparator = new Comparator() { + public int compare(K k1, K k2) { + int compare = map.get(k2).compareTo(map.get(k1)); + if (compare == 0) + return 1; + else + return compare; + } + }; + + TreeMap sortedByValues = new TreeMap<>(valueComparator); + sortedByValues.putAll(map); + return sortedByValues; + } + + +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/LabelResultTree.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/LabelResultTree.java new file mode 100755 index 000000000..768df2f86 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/LabelResultTree.java @@ -0,0 +1,66 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.util; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * This class is used by the Dataless Classifier for selecting labels while traversing Top-Down + * + * @author yqsong@illinois.edu + * @author shashank + */ + +public class LabelResultTree { + private LabelResultTreeNode rootNode; + + public LabelResultTree() { + rootNode = new LabelResultTreeNode(); + } + + public LabelResultTreeNode getRootNode() { + return rootNode; + } + + /** + * Returns a Map, where key is the depth, and value is a list of the metadata associated with the nodes at that depth + */ + public Map> getFullDepthPredictions() { + Map> depthLabelMap = new HashMap<>(); + + populateDepthPredictions(rootNode, 1, depthLabelMap); + + for (Integer depth : depthLabelMap.keySet()) { + Collections.sort(depthLabelMap.get(depth)); + Collections.reverse(depthLabelMap.get(depth)); + } + + return depthLabelMap; + } + + /** + * Recursively populates the DepthLabelMap + */ + private void populateDepthPredictions(LabelResultTreeNode root, double parentScore, Map> depthLabelMap) { + if (!depthLabelMap.containsKey(root.getDepth())) { + depthLabelMap.put(root.getDepth(), new ArrayList<>()); + } + + LabelScorePair labelScorePair = root.getLabelScorePair(); + labelScorePair.setScore(labelScorePair.getScore() * parentScore); + depthLabelMap.get(root.getDepth()).add(labelScorePair); + + for (LabelResultTreeNode childNode: root.getChildren()) { + populateDepthPredictions(childNode, 1, depthLabelMap); + } + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/LabelResultTreeNode.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/LabelResultTreeNode.java new file mode 100755 index 000000000..1e63caceb --- /dev/null +++ 
b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/LabelResultTreeNode.java @@ -0,0 +1,63 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.util; + +import java.util.ArrayList; +import java.util.List; + +/** + * The Node class used by {@link LabelResultTree} + * + * @author yqsong@illinois.edu + * @author shashank + */ + +public class LabelResultTreeNode { + private LabelScorePair labelScorePair; + private boolean isLeaf; + private int depth; + + private List children; + + public LabelResultTreeNode() { + isLeaf = false; + children = new ArrayList<>(); + } + + public LabelScorePair getLabelScorePair() { + return labelScorePair; + } + + public void setLabelScorePair(LabelScorePair labelScorePair) { + this.labelScorePair = labelScorePair; + } + + public boolean isLeaf() { + return isLeaf; + } + + public void setIsLeaf(boolean isLeaf) { + this.isLeaf = isLeaf; + } + + public int getDepth() { + return depth; + } + + public void setDepth(int depth) { + this.depth = depth; + } + + public List getChildren() { + return children; + } + + public void setChildren(List children) { + this.children = children; + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/LabelScorePair.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/LabelScorePair.java new file mode 100755 index 000000000..e1c5e6a38 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/LabelScorePair.java @@ -0,0 +1,46 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.util; + +/** + * A small utility class to wrap a pair of labelID and its score + * + * @author yqsong@illinois.edu + * @author shashank + */ + +public class LabelScorePair implements Comparable { + public String labelID; + double labelScore; + + public LabelScorePair(String labelID, double score) { + this.labelID = labelID; + labelScore = score; + } + + public String getLabelID() { + return labelID; + } + + public double getScore() { + return labelScore; + } + + public void setLabelID(String labelID) { + this.labelID = labelID; + } + + public void setScore(double score) { + labelScore = score; + } + + @Override + public int compareTo(LabelScorePair kvp) { + return Double.compare(this.labelScore, kvp.labelScore); + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/QueryPreProcessor.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/QueryPreProcessor.java new file mode 100755 index 000000000..00b000a2c --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/QueryPreProcessor.java @@ -0,0 +1,60 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. 
See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.util; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +public class QueryPreProcessor { + + private static Set stopSet = new HashSet<>(); + private static Set stopWords; + + public static String process(String query) { + + if (stopSet.size() == 0) { + stopSet = getStopWords(); + } + + StringBuffer newQuery = new StringBuffer(""); + + query = + query.toLowerCase().replaceAll(",", " ").replaceAll(":", " ") + .replaceAll("\\.", " "); + query = query.toLowerCase().replaceAll("\\?", " ").replaceAll("\\*", " "); + query = query.toLowerCase().replaceAll("\\[", " ").replaceAll("\\]", " "); + query = query.toLowerCase().replaceAll("\\(", " ").replaceAll("\\)", " "); + query = query.toLowerCase().replaceAll("\\{", " ").replaceAll("\\}", " "); + query = query.toLowerCase().replaceAll("\\<", " ").replaceAll("\\>", " "); + query = query.toLowerCase().replaceAll("\"", " "); + + String[] queryArray = query.split("\\s+"); + + for (String str : queryArray) { + if (!stopSet.contains(str.trim())) { + newQuery.append(str).append(" "); + } + } + + return newQuery.toString(); + } + + private static Set getStopWords() { + if (stopWords == null) { + stopWords = new HashSet<>(); + + stopWords.addAll(Arrays.asList("I", "a", "about", "an", "are", "as", "at", "be", "by", + "com", "de", "en", "for", "from", "how", "in", "is", "it", "la", "of", "on", + "or", "that", "the", "this", "to", "was", "what", "when", "where", "who", + "will", "with", "und", "the", "www")); + } + + return stopWords; + } +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/SparseVector.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/SparseVector.java new file mode 100755 index 000000000..9ab5f2116 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/SparseVector.java @@ -0,0 +1,272 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.util; + +import java.io.Serializable; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * A general-purpose SparseVector implementation. 
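+ *
+ * Wraps a key-to-score map together with a cached norm; keys are typically Integer concept IDs (ESA)
+ * or dimension indices (Word2Vec). A small illustrative sketch (the values are hypothetical):
+ * <pre>{@code
+ * SparseVector<Integer> vec = new SparseVector<>();
+ * vec.increment(42, 0.5); // inserts the key if it is absent
+ * vec.scaleAll(2.0);      // rescales every score and refreshes the cached norm
+ * }</pre>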
+ * + * @author yqsong@illinois.edu + * @author sgupta96 + */ + +public class SparseVector implements Serializable { + private static final long serialVersionUID = 7206236102813490658L; + + Map keyValueMap; + double norm; + + public SparseVector() { + keyValueMap = new HashMap<>(); + norm = 0; + } + + public SparseVector(List keys, List scores) { + setVector(keys, scores); + } + + public SparseVector(List keys, List scores, Map weights) { + setVector(keys, scores, weights); + } + + public SparseVector(Map map) { + setVector(map); + } + + public SparseVector(Map map, Map weights) { + setVector(map, weights); + } + + public SparseVector(SparseVector v) { + keyValueMap = v.keyValueMap; + norm = v.norm; + } + + public static SparseVector deepCopy(SparseVector thatVector) { + Map thisMap = new HashMap<>(); + Map thatMap = thatVector.getKeyValueMap(); + + for (K key : thatMap.keySet()) { + thisMap.put(key, thatMap.get(key)); + } + + SparseVector thisVector = new SparseVector<>(thisMap); + return thisVector; + } + + public void setVector(Map map) { + keyValueMap = map; + norm = SparseVectorOperations.getNorm(keyValueMap); + } + + public void setVector(Map map, Map weights) { + keyValueMap = map; + norm = SparseVectorOperations.getNorm(keyValueMap, weights); + } + + public void setVector(List keys, List scores) { + keyValueMap = createKeyValueHashMap(keys, scores); + norm = SparseVectorOperations.getNorm(keyValueMap); + } + + public void setVector(List keys, List scores, Map weights) { + if (weights == null) { + weights = new HashMap<>(); + } + + keyValueMap = createKeyValueHashMap(keys, scores); + norm = SparseVectorOperations.getNorm(keyValueMap, weights); + } + + public void incrementAll(double value) { + for (T key : keyValueMap.keySet()) { + keyValueMap.put(key, keyValueMap.get(key) + value); + } + + updateNorm(); + } + + /** + * TODO: Decide what to do when the key is not found + * Currently, it just adds the key to the map if it is not found + */ + public void increment(T key, double value) { + if (keyValueMap.containsKey(key)) + keyValueMap.put(key, keyValueMap.get(key) + value); + else + keyValueMap.put(key, value); + + updateNorm(); + } + + public void scaleAll(double value) { + for (T key : keyValueMap.keySet()) { + keyValueMap.put(key, keyValueMap.get(key) * value); + } + + updateNorm(); + } + + /** + * TODO: Decide what to do when the key is not found + * Currently, it just adds the key to the map if it is not found + */ + public void scale(T key, double value) { + if (keyValueMap.containsKey(key)) + keyValueMap.put(key, keyValueMap.get(key) * value); + else + keyValueMap.put(key, value); + + updateNorm(); + } + + public Map getKeyValueMap() { + return this.keyValueMap; + } + + public Set getKeys() { + return this.keyValueMap.keySet(); + } + + public double getNorm() { + return this.norm; + } + + public int size() { + return keyValueMap.size(); + } + + public void updateNorm() { + norm = SparseVectorOperations.getNorm(this.keyValueMap); + } + + public void updateNorm(Map weights) { + norm = SparseVectorOperations.getNorm(this.keyValueMap, weights); + } + + public static Comparator> decreasingScores() { + return new Comparator>() { + + public int compare(Map.Entry o1, Map.Entry o2) { + return -1 * (o1.getValue().compareTo(o2.getValue())); + } + }; + } + + public static Comparator> increasingScores() { + return new Comparator>() { + + public int compare(Map.Entry o1, Map.Entry o2) { + return o1.getValue().compareTo(o2.getValue()); + } + }; + } + + public static SparseVector 
getOrderedSparseVector( + SparseVector vector, Comparator> c, int topK) { + + SparseVector sparseVector = new SparseVector<>(); + + List> entries = + new LinkedList<>(vector.keyValueMap.entrySet()); + + Collections.sort(entries, c); + + Map sortedMap = new LinkedHashMap<>(); + + int i = 0; + + for (Map.Entry entry : entries) { + if (i >= topK) + break; + + i++; + + sortedMap.put(entry.getKey(), entry.getValue()); + } + + sparseVector.setVector(sortedMap); + return sparseVector; + } + + public static SparseVector getOrderedSparseVector( + SparseVector vector, Comparator> c) { + + int size = vector.size(); + return getOrderedSparseVector(vector, c, size); + } + + private static Map createKeyValueHashMap(List keys, List scores) { + Map keyValueMap = new HashMap<>(); + + for (int i = 0; i < keys.size(); i++) { + keyValueMap.put(keys.get(i), scores.get(i)); + } + + return keyValueMap; + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + double epsilon = 0.0001; + + if (!(o instanceof SparseVector)) + return false; + + SparseVector that = (SparseVector) o; + + if (this.size() != that.size()) + return false; + + Map thatMap = that.getKeyValueMap(); + + for (T key : keyValueMap.keySet()) { + if (!thatMap.containsKey(key)) + return false; + + Double thisVal = this.keyValueMap.get(key); + Double thatVal = thatMap.get(key); + + if (Math.abs(thisVal - thatVal) > epsilon) + return false; + } + + Double thisNorm = this.getNorm(); + Double thatNorm = that.getNorm(); + + if (Math.abs(thisNorm - thatNorm) > epsilon) + return false; + + return true; + } + + public String toString() { + StringBuilder str = new StringBuilder(""); + + for (T key : keyValueMap.keySet()) { + str.append(key.toString()); + str.append(","); + str.append(keyValueMap.get(key)); + str.append(";"); + } + + return str.toString(); + } + + // TODO: add a suitable hashCode function? +} diff --git a/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/SparseVectorOperations.java b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/SparseVectorOperations.java new file mode 100755 index 000000000..8d2a48c53 --- /dev/null +++ b/dataless-classifier/src/main/java/edu/illinois/cs/cogcomp/datalessclassification/util/SparseVectorOperations.java @@ -0,0 +1,428 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification.util; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Collection of useful functions for working with the {@link SparseVector} + * + * @author shashank + */ + +public class SparseVectorOperations { + + public static double getNorm(Map vector) { + Map weights = new HashMap<>(); + + return getNorm(vector, weights); + } + + public static double getNorm(Map vector, Map weights) { + if (weights == null) { + weights = new HashMap<>(); + } + + double norm = 0; + + for (T key : vector.keySet()) { + double value = vector.get(key); + double weight = 1; + + if (weights.containsKey(key)) + weight = weights.get(key); + + norm += value * value * weight; + } + + norm = Math.sqrt(norm); + + return norm; + } + + public static Map add(Map v1, Map v2) { + return add(v1, v2, 1, 1); + } + + public static Map addMultipleMaps(List> vectorList) { + Map finalVector = new HashMap<>(); + + for (Map vectorMap : vectorList) { + finalVector = add(finalVector, vectorMap, 1, 1); + } + + return finalVector; + } + + public static Map add(Map v1, Map v2, double weight1, + double weight2) { + Map sum = new HashMap<>(v1.size()); + + for (T key : v1.keySet()) { + if (v2.containsKey(key)) { + double value1 = v1.get(key) * weight1; + double value2 = v2.get(key) * weight2; + sum.put(key, value1 + value2); + } else { + double value1 = v1.get(key) * weight1; + sum.put(key, value1); + } + } + + for (T key : v2.keySet()) { + if (!v1.containsKey(key)) { + double value2 = v2.get(key) * weight2; + sum.put(key, value2); + } + } + + return sum; + } + + public static Map addMultipleMaps(List> vectorList, + List scoreList) { + Map finalVector = new HashMap<>(); + + for (int i = 0; i < vectorList.size(); i++) { + finalVector = add(finalVector, vectorList.get(i), 1, scoreList.get(i)); + } + + return finalVector; + } + + public static SparseVector add(SparseVector v1, + SparseVector v2) { + Map sum = add(v1.getKeyValueMap(), v2.getKeyValueMap()); + SparseVector sumVec = new SparseVector<>(sum); + + return sumVec; + } + + public static SparseVector addMultipleVectors( + List> vectorList) { + List> maps = new ArrayList<>(vectorList.size()); + + for (SparseVector vector : vectorList) { + maps.add(vector.getKeyValueMap()); + } + + Map sum = addMultipleMaps(maps); + SparseVector sumVec = new SparseVector<>(sum); + + return sumVec; + } + + public static SparseVector averageMultipleVectors( + List> vectorList) { + SparseVector sumVec = addMultipleVectors(vectorList); + sumVec.scaleAll(1.0 / vectorList.size()); + + return sumVec; + } + + public static SparseVector add(SparseVector v1, + SparseVector v2, double weight1, double weight2) { + Map sum = add(v1.getKeyValueMap(), v2.getKeyValueMap(), weight1, weight2); + SparseVector sumVec = new SparseVector<>(sum); + + return sumVec; + } + + public static SparseVector addMultipleVectors( + List> vectorList, List weightList) { + List> maps = new ArrayList<>(vectorList.size()); + + for (SparseVector vector : vectorList) { + maps.add(vector.getKeyValueMap()); + } + + Map sum = addMultipleMaps(maps, weightList); + SparseVector sumVec = new SparseVector<>(sum); + + return sumVec; + } + + public static SparseVector averageMultipleVectors( + List> vectorList, 
List weightList) { + SparseVector sumVec = addMultipleVectors(vectorList, weightList); + sumVec.scaleAll(1.0 / vectorList.size()); + + return sumVec; + } + + public static SparseVector add(SparseVector v1, + SparseVector v2, Map weights) { + Map sum = add(v1.getKeyValueMap(), v2.getKeyValueMap()); + SparseVector sumVec = new SparseVector<>(sum, weights); + + return sumVec; + } + + public static double cosine(Map vector1, Map vector2) { + Map weights = new HashMap<>(); + + return cosine(vector1, vector2, weights); + } + + public static double cosine(Map vector1, Map vector2, + Map weights) { + double norm1 = getNorm(vector1, weights); + double norm2 = getNorm(vector2, weights); + + return cosine(vector1, vector2, norm1, norm2, weights); + } + + public static double cosine(Map vector1, Map vector2, double norm1, + double norm2) { + Map weights = new HashMap<>(); + + return cosine(vector1, vector2, norm1, norm2, weights); + } + + public static double cosine(Map vector1, Map vector2, double norm1, + double norm2, Map weights) { + if (weights == null) { + weights = new HashMap<>(); + } + + double dot = 0; + + if (vector1.size() < vector2.size()) { + for (T key : vector1.keySet()) { + if (vector2.containsKey(key)) { + double value1 = vector1.get(key); + double value2 = vector2.get(key); + double weight = 1; + + if (weights.containsKey(key)) + weight = weights.get(key); + + dot += value1 * value2 * weight; + } + } + } else { + for (T key : vector2.keySet()) { + if (vector1.containsKey(key)) { + double value1 = vector1.get(key); + double value2 = vector2.get(key); + double weight = 1; + + if (weights.containsKey(key)) + weight = weights.get(key); + + dot += value1 * value2 * weight; + } + } + } + + return dot / (norm1 + Double.MIN_NORMAL) / (norm2 + Double.MIN_NORMAL); + } + + public static double cosine(SparseVector v1, SparseVector v2) { + return cosine(v1.getKeyValueMap(), v2.getKeyValueMap()); + } + + public static double cosine(SparseVector vector1, + SparseVector vector2, double norm1, double norm2) { + return cosine(vector1.getKeyValueMap(), vector2.getKeyValueMap(), norm1, norm2); + } + + public static double cosine(SparseVector v1, SparseVector v2, + Map weights) { + return cosine(v1.getKeyValueMap(), v2.getKeyValueMap(), weights); + } + + public static double cosine(SparseVector vector1, + SparseVector vector2, double norm1, double norm2, Map weights) { + return cosine(vector1.getKeyValueMap(), vector2.getKeyValueMap(), norm1, norm2, weights); + } + + + static double jaccard(SparseVector v1, SparseVector v2) { + Map vector1 = v1.getKeyValueMap(); + Map vector2 = v2.getKeyValueMap(); + return jaccard(vector1, vector2); + } + + // TODO: Check this function + static double jaccard(Map vector1, Map vector2) { + Set set1 = new HashSet<>(vector1.keySet()); + Set set2 = vector2.keySet(); + set1.retainAll(set2); + + int overlap = set1.size(); + + return ((double) overlap) / (set1.size() + set2.size()); + } + + static double SkewDivergence(SparseVector vector1, + SparseVector vector2, double gamma) { + return SkewDivergence(vector1.getKeyValueMap(), vector2.getKeyValueMap(), gamma); + } + + // TODO: Check this function + static double SkewDivergence(Map vector1, Map vector2, + double gamma) { + double result = 0.0; + + // combine two vectors and get a middle + Map middleVector = new HashMap(); + + double sumV1 = 0; + + for (T key : vector1.keySet()) { + sumV1 += vector1.get(key); + } + + for (T key : vector1.keySet()) { + vector1.put(key, vector1.get(key) / sumV1); + } + + double sumV2 = 0; + + 
for (T key : vector2.keySet()) { + sumV2 += vector2.get(key); + } + + for (T key : vector2.keySet()) { + vector2.put(key, vector2.get(key) / sumV2); + } + + for (T key : vector2.keySet()) { + double value2 = vector2.get(key); + + if (vector1.containsKey(key)) { + double value1 = vector1.get(key); + middleVector.put(key, (gamma * value1 + (1 - gamma) * value2)); + } else { + middleVector.put(key, (1 - gamma) * value2); + } + } + + for (T key : vector1.keySet()) { + if (!middleVector.containsKey(key)) { + double value1 = vector1.get(key); + middleVector.put(key, gamma * value1); + } + } + + double kld1 = KLDivergence(vector1, middleVector); + result = (Double.MIN_VALUE + kld1); + + if (result == 0) + return 0; + else + return 1 / result; + } + + static double JensenShannon(SparseVector vector1, + SparseVector vector2) { + return JensenShannon(vector1.getKeyValueMap(), vector2.getKeyValueMap()); + } + + // TODO: Check this function + static double JensenShannon(Map vector1, Map vector2) { + double result = 0.0; + + // combine two vectors and get a middle + Map middleVector = new HashMap(); + + double sumV1 = 0; + + for (T key : vector1.keySet()) { + sumV1 += vector1.get(key); + } + + for (T key : vector1.keySet()) { + vector1.put(key, vector1.get(key) / sumV1); + } + + double sumV2 = 0; + + for (T key : vector2.keySet()) { + sumV2 += vector2.get(key); + } + + for (T key : vector2.keySet()) { + vector2.put(key, vector2.get(key) / sumV2); + } + + for (T key : vector2.keySet()) { + double value2 = vector2.get(key); + + if (vector1.containsKey(key)) { + double value1 = vector1.get(key); + middleVector.put(key, (value1 + value2) / 2); + } else { + middleVector.put(key, value2 / 2); + } + } + + for (T key : vector1.keySet()) { + if (!middleVector.containsKey(key)) { + double value1 = vector1.get(key); + middleVector.put(key, value1 / 2); + } + } + + + // result = (Double.MIN_VALUE + (KLDivergence(vector1, vector2)) / 2); + // result = (Double.MIN_VALUE + (KLDivergence(vector2, vector1)) / 2); + + // result = (Double.MIN_VALUE + (KLDivergence(vector1, middleVector)) / 2); + // result = (Double.MIN_VALUE + (KLDivergence(vector2, middleVector)) / 2); + + double kld1 = KLDivergence(vector1, middleVector); + double kld2 = KLDivergence(vector2, middleVector); + result = (Double.MIN_VALUE + (kld1 + kld2) / 2); + + if (result == 0) + return 0; + else + return 1 / result; + } + + static double KLDivergence(SparseVector vector1, + SparseVector vector2) { + return KLDivergence(vector1.getKeyValueMap(), vector2.getKeyValueMap()); + } + + // TODO: Check this function + static double KLDivergence(Map vector1, Map vector2) { + double result = 0.0; + + if (vector1.size() == 0 || vector2.size() == 0) + return 0; + + double tempValue = 0.0; // save p(i)*log(p(i)/q(i)) + + for (T key : vector2.keySet()) { // traverse the longer vector + double value2 = vector2.get(key); + + if (vector1.containsKey(key)) { // find key in another vector + double value1 = vector1.get(key); + tempValue = value1 * Math.log(value1 / value2) / Math.log(2); + + // String newKey = key + ""; + // if (pageTitleIDMap.containsKey(key)) { + // newKey = pageTitleIDMap.get(key); + // } + + result += tempValue; + } + } + + return result; + } +} diff --git a/dataless-classifier/src/main/resources/config/project.properties b/dataless-classifier/src/main/resources/config/project.properties new file mode 100644 index 000000000..ced626a63 --- /dev/null +++ b/dataless-classifier/src/main/resources/config/project.properties @@ -0,0 +1,16 @@ +## Use 
ResourceManager to read these properties +# curatorHost = trollope.cs.illinois.edu +# curatorPort = 9010 + +## Target Label Hierarchy +labelHierarchyPath = data/hierarchies/20newsgroups/parentChildIdMap.txt +labelNamePath = data/hierarchies/20newsgroups/idToLabelNameMap.txt +labelDescPath = data/hierarchies/20newsgroups/labelDesc_Kws_simple.txt +# labelDescPath = data/hierarchies/20newsgroups/labelDesc_Kws_embellished.txt + +## Classifier configuration +inferenceBottomUp = True +classifierThreshold = 0.99 +classifierLeastK = 1 +classifierMaxK = 3 + diff --git a/dataless-classifier/src/main/resources/hierarchies/20newsgroups/idToLabelNameMap.txt b/dataless-classifier/src/main/resources/hierarchies/20newsgroups/idToLabelNameMap.txt new file mode 100644 index 000000000..48034a682 --- /dev/null +++ b/dataless-classifier/src/main/resources/hierarchies/20newsgroups/idToLabelNameMap.txt @@ -0,0 +1,26 @@ +politics politics +religion religion +computer computer +autos.sports autos.sports +science science +sales sales +talk.politics.guns talk.politics.guns +talk.politics.mideast talk.politics.mideast +talk.politics.misc talk.politics.misc +alt.atheism alt.atheism +soc.religion.christian soc.religion.christian +talk.religion.misc talk.religion.misc +comp.sys.ibm.pc.hardware comp.sys.ibm.pc.hardware +comp.sys.mac.hardware comp.sys.mac.hardware +comp.graphics comp.graphics +comp.windows.x comp.windows.x +comp.os.ms.windows.misc comp.os.ms.windows.misc +rec.autos rec.autos +rec.motorcycles rec.motorcycles +rec.sport.baseball rec.sport.baseball +rec.sport.hockey rec.sport.hockey +sci.electronics sci.electronics +sci.crypt sci.crypt +sci.med sci.med +sci.space sci.space +misc.forsale misc.forsale diff --git a/dataless-classifier/src/main/resources/hierarchies/20newsgroups/labelDesc_Kws_embellished.txt b/dataless-classifier/src/main/resources/hierarchies/20newsgroups/labelDesc_Kws_embellished.txt new file mode 100644 index 000000000..c2b10da6f --- /dev/null +++ b/dataless-classifier/src/main/resources/hierarchies/20newsgroups/labelDesc_Kws_embellished.txt @@ -0,0 +1,26 @@ +politics politics gun fbi guns weapon compound israel arab jews jewish muslim gay homosexual sexual +religion religion atheist christian atheism god islamic christian god christ church bible jesus christian morality jesus god religion horus +computer computer bus pc motherboard bios board computer dos mac apple powerbook graphics image gif animation tiff window motif xterm sun windows windows dos microsoft ms driver drivers card printer +autos.sports autos.sports car ford auto toyota honda nissan bmw bike motorcycle yamaha baseball ball hitter hockey wings espn +science science circuit electronics radio signal battery encryption key crypto algorithm security doctor medical disease medicine patient space orbit moon earth sky solar +sales sales sale offer shipping forsale sell price brand obo +talk.politics.guns gun fbi guns weapon compound +talk.politics.mideast israel arab jews jewish muslim +talk.politics.misc gay homosexual sexual +alt.atheism atheist christian atheism god islamic +soc.religion.christian christian god christ church bible jesus +talk.religion.misc christian morality jesus god religion horus +comp.sys.ibm.pc.hardware bus pc motherboard bios board computer dos +comp.sys.mac.hardware mac apple powerbook +comp.graphics graphics image gif animation tiff +comp.windows.x window motif xterm sun windows +comp.os.ms.windows.misc windows dos microsoft ms driver drivers card printer +rec.autos car ford auto toyota honda nissan bmw 
+rec.motorcycles bike motorcycle yamaha +rec.sport.baseball baseball ball hitter +rec.sport.hockey hockey wings espn +sci.electronics circuit electronics radio signal battery +sci.crypt encryption key crypto algorithm security +sci.med doctor medical disease medicine patient +sci.space space orbit moon earth sky solar +misc.forsale sale offer shipping forsale sell price brand obo diff --git a/dataless-classifier/src/main/resources/hierarchies/20newsgroups/labelDesc_Kws_simple.txt b/dataless-classifier/src/main/resources/hierarchies/20newsgroups/labelDesc_Kws_simple.txt new file mode 100644 index 000000000..0c0a7ce7b --- /dev/null +++ b/dataless-classifier/src/main/resources/hierarchies/20newsgroups/labelDesc_Kws_simple.txt @@ -0,0 +1,26 @@ +politics politics politics guns politics mideast politics +religion religion atheism society religion christianity christian religion +computer computer computer systems ibm pc hardware computer systems mac macintosh apple hardware computer graphics computer windows x windowsx computer os operating system microsoft windows +autos.sports autos.sports cars motorcycles baseball hockey +science science science electronics science cryptography medicine science space +sales sales for sale discount +talk.politics.guns politics guns +talk.politics.mideast politics mideast +talk.politics.misc politics +alt.atheism atheism +soc.religion.christian society religion christianity christian +talk.religion.misc religion +comp.sys.ibm.pc.hardware computer systems ibm pc hardware +comp.sys.mac.hardware computer systems mac macintosh apple hardware +comp.graphics computer graphics +comp.windows.x computer windows x windowsx +comp.os.ms.windows.misc computer os operating system microsoft windows +rec.autos cars +rec.motorcycles motorcycles +rec.sport.baseball baseball +rec.sport.hockey hockey +sci.electronics science electronics +sci.crypt science cryptography +sci.med science medicine +sci.space science space +misc.forsale for sale discount diff --git a/dataless-classifier/src/main/resources/hierarchies/20newsgroups/parentChildIdMap.txt b/dataless-classifier/src/main/resources/hierarchies/20newsgroups/parentChildIdMap.txt new file mode 100644 index 000000000..5c49b8bbd --- /dev/null +++ b/dataless-classifier/src/main/resources/hierarchies/20newsgroups/parentChildIdMap.txt @@ -0,0 +1,7 @@ +politics religion computer autos.sports science sales +politics talk.politics.guns talk.politics.mideast talk.politics.misc +religion alt.atheism soc.religion.christian talk.religion.misc +computer comp.sys.ibm.pc.hardware comp.sys.mac.hardware comp.graphics comp.windows.x comp.os.ms.windows.misc +autos.sports rec.autos rec.motorcycles rec.sport.baseball rec.sport.hockey +science sci.electronics sci.crypt sci.med sci.space +sales misc.forsale \ No newline at end of file diff --git a/dataless-classifier/src/main/resources/license-header.txt b/dataless-classifier/src/main/resources/license-header.txt new file mode 100644 index 000000000..2149f7141 --- /dev/null +++ b/dataless-classifier/src/main/resources/license-header.txt @@ -0,0 +1,5 @@ +This software is released under the University of Illinois/Research and Academic Use License. See +the LICENSE file in the root folder for details. 
Copyright (c) 2016 + +Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign +http://cogcomp.cs.illinois.edu/ \ No newline at end of file diff --git a/dataless-classifier/src/main/resources/log4j.properties b/dataless-classifier/src/main/resources/log4j.properties new file mode 100644 index 000000000..ea57ac43b --- /dev/null +++ b/dataless-classifier/src/main/resources/log4j.properties @@ -0,0 +1,8 @@ +# Root logger option +log4j.rootLogger=INFO, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{HH:mm:ss} %-5p %c{1}:%L - %m%n \ No newline at end of file diff --git a/dataless-classifier/src/main/resources/sampleDocument.txt b/dataless-classifier/src/main/resources/sampleDocument.txt new file mode 100644 index 000000000..f7e5d430f --- /dev/null +++ b/dataless-classifier/src/main/resources/sampleDocument.txt @@ -0,0 +1 @@ +i m looking for some recommendations for screen capture programs a couple of issues ago pc mag listed as editor s choices both conversion artist and hijaak for windows anyone have any experience with those or some others i m trying to get an alpha manual in the next few days and i m not making much progress with the screen shots i m currently using dodot and i m about to burn it and the disks it rode it on it s got a lot of freaky bugs and oversights that are driving me crazy tonight it decided that for any graphic it writes out as a tiff file that s under a certain arbitrary size it will swap the left and right sides of the picture usually it confines itself to not copying things to the clipboard so i have to save and load pix for editing in paintbrush or crashing every hour or so the one nice thing it has though is it s dither option you d think that this would turn colors into dots which it does if you go from say colors to colors but if you go from or colors to b w you can set a threshold level for which colors turn to black and which turn to white for me this is useful because i can turn light grays on buttons to white and the dark grays to black and thereby preserve the d effect on buttons and other parts of the window if you understood my description can you tell me if another less buggy program can do this as well much thanks for any help signature david delgreco what lies behind us and what lies technically a writer before us are tiny matters compared delgreco rahul net to what lies within us oliver wendell holmes david f delgreco delgreco rahul net recommendation for screen capture program diff --git a/dataless-classifier/src/main/resources/testFiles/electronicsTestDocument.txt b/dataless-classifier/src/main/resources/testFiles/electronicsTestDocument.txt new file mode 100644 index 000000000..799d39f7a --- /dev/null +++ b/dataless-classifier/src/main/resources/testFiles/electronicsTestDocument.txt @@ -0,0 +1 @@ +yes i know it s nowhere near christmas time but i m gonna loose net access in a few days maybe a week or if i m lucky and wanted to post this for interested people to save till xmas note bell labs is a good place if you have a phd and a good boss i have neither subject xmas light set with levels of brightness another version of a variable brightness xmas light set this set starts with a blinker bulb string diagram orginal way set 0v b b 0rtn modified set for level brightness string 0v 0k w string b 0v rtn note no mods to 
wiring to the right of this point only one blinker is used note that the blinker would not have as much current thru it as the string bulbs because of the second string of bulbs in parallel with it that s why the use of the 0k w resistor here to add extra current thru the blinker to make up for the current shunted thru the second string while the blinker is glowing and the second string is not glowing when the blinker goes open this resistor has only a slight effect on the brightness of the strings s slightly dimmer s slightly brighter or use a w 0v bulb in place of the 0k resistor if you can get one caution do not replace with a standard c bulb as these draw too much current and burn out the blinker c approx w what you ll see when it s working powerup string will light at full brightness and b will be lit bypassing most of the current from the second string making them not light b will open placing both strings in series making the string that was out to glow at a low brightness and the other string that was on before to glow at reduced brightness be sure to wire and insulate the splices resistor leads and cut wires in a safe manner level brightness xmas light set for easter diff --git a/dataless-classifier/src/main/resources/testFiles/graphicsTestDocument.txt b/dataless-classifier/src/main/resources/testFiles/graphicsTestDocument.txt new file mode 100644 index 000000000..f7e5d430f --- /dev/null +++ b/dataless-classifier/src/main/resources/testFiles/graphicsTestDocument.txt @@ -0,0 +1 @@ +i m looking for some recommendations for screen capture programs a couple of issues ago pc mag listed as editor s choices both conversion artist and hijaak for windows anyone have any experience with those or some others i m trying to get an alpha manual in the next few days and i m not making much progress with the screen shots i m currently using dodot and i m about to burn it and the disks it rode it on it s got a lot of freaky bugs and oversights that are driving me crazy tonight it decided that for any graphic it writes out as a tiff file that s under a certain arbitrary size it will swap the left and right sides of the picture usually it confines itself to not copying things to the clipboard so i have to save and load pix for editing in paintbrush or crashing every hour or so the one nice thing it has though is it s dither option you d think that this would turn colors into dots which it does if you go from say colors to colors but if you go from or colors to b w you can set a threshold level for which colors turn to black and which turn to white for me this is useful because i can turn light grays on buttons to white and the dark grays to black and thereby preserve the d effect on buttons and other parts of the window if you understood my description can you tell me if another less buggy program can do this as well much thanks for any help signature david delgreco what lies behind us and what lies technically a writer before us are tiny matters compared delgreco rahul net to what lies within us oliver wendell holmes david f delgreco delgreco rahul net recommendation for screen capture program diff --git a/dataless-classifier/src/test/java/edu/illinois/cs/cogcomp/datalessclassification/ESADatalessTest.java b/dataless-classifier/src/test/java/edu/illinois/cs/cogcomp/datalessclassification/ESADatalessTest.java new file mode 100644 index 000000000..f2a6ebbbc --- /dev/null +++ b/dataless-classifier/src/test/java/edu/illinois/cs/cogcomp/datalessclassification/ESADatalessTest.java @@ -0,0 +1,190 @@ +/** 
+ * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification; + +import org.junit.Test; + +import edu.illinois.cs.cogcomp.annotation.AnnotatorException; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.config.ESADatalessConfigurator; +import edu.illinois.cs.cogcomp.datalessclassification.ta.ADatalessAnnotator; +import edu.illinois.cs.cogcomp.datalessclassification.ta.ESADatalessAnnotator; +import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer; +import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.*; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +/** + * @author shashank + */ +public class ESADatalessTest { + private String configFile; + private ESADatalessAnnotator dataless; + + private List documents; + private List> docLabels; + + @Test + public void testPredictions() { + try { + configFile = "config/project.properties"; + + ResourceManager nonDefaultRm = new ResourceManager(configFile); + ResourceManager rm = new ESADatalessConfigurator().getConfig(nonDefaultRm); + dataless = new ESADatalessAnnotator(rm); + + documents = new ArrayList<>(); + String doc1 = + "i m looking for some recommendations for screen capture programs a couple" + + " of issues ago pc mag listed as editor s choices both conversion artist" + + " and hijaak for windows anyone have any experience with those or some others" + + " i m trying to get an alpha manual in the next few days and i m not making much" + + " progress with the screen shots i m currently using dodot and i m about to burn it" + + " and the disks it rode it on it s got a lot of freaky bugs and oversights that are " + + "driving me crazy tonight it decided that for any graphic it writes out as a tiff " + + "file that s under a certain arbitrary size it will swap the left and right sides of" + + " the picture usually it confines itself to not copying things to the clipboard so i " + + "have to save and load pix for editing in paintbrush or crashing every hour or so the " + + "one nice thing it has though is it s dither option you d think that this would turn " + + "colors into dots which it does if you go from say colors to colors but if you go " + + "from or colors to b w you can set a threshold level for which colors turn to black " + + "and which turn to white for me this is useful because i can turn light grays on buttons" + + " to white and the dark grays to black and thereby preserve the d effect on buttons and " + + "other parts of the window if you understood my description can you tell me if another " + + "less buggy program can do this as well much thanks for any help signature david delgreco " + + "what lies behind us and what lies technically a writer before us are tiny matters compared " + + "delgreco rahul net to what lies within us oliver wendell holmes david f delgreco delgreco rahul " + + "net 
recommendation for screen capture program"; + documents.add(doc1); + + String doc2 = + "yes i know it s nowhere near christmas time but i m gonna loose net access in a few days maybe " + + "a week or if i m lucky and wanted to post this for interested people to save till xmas " + + "note bell labs is a good place if you have a phd and a good boss i have neither subject " + + "xmas light set with levels of brightness another version of a variable brightness xmas " + + "light set this set starts with a blinker bulb string diagram orginal way set 0v b b " + + "0rtn modified set for level brightness string 0v 0k w string b 0v rtn note no mods to " + + "wiring to the right of this point only one blinker is used note that the blinker " + + "would not have as much current thru it as the string bulbs because of the second " + + "string of bulbs in parallel with it that s why the use of the 0k w resistor here to " + + "add extra current thru the blinker to make up for the current shunted thru the second " + + "string while the blinker is glowing and the second string is not glowing when the " + + "blinker goes open this resistor has only a slight effect on the brightness of the " + + "strings s slightly dimmer s slightly brighter or use a w 0v bulb in place of the 0k " + + "resistor if you can get one caution do not replace with a standard c bulb as these " + + "draw too much current and burn out the blinker c approx w what you ll see when it s " + + "working powerup string will light at full brightness and b will be lit bypassing most " + + "of the current from the second string making them not light b will open placing both " + + "strings in series making the string that was out to glow at a low brightness and the " + + "other string that was on before to glow at reduced brightness be sure to wire and insulate" + + " the splices resistor leads and cut wires in a safe manner level brightness xmas light " + + "set for easter"; + documents.add(doc2); + + docLabels = new ArrayList<>(); + Set docLabels1 = new HashSet<>(Arrays.asList("computer", "comp.graphics")); + docLabels.add(docLabels1); + + Set docLabels2 = new HashSet<>(Arrays.asList("autos.sports", "rec.autos")); + docLabels.add(docLabels2); + } catch (IOException e) { + e.printStackTrace(); + System.out.println("IO Error while initializing the annotator .. " + e.getMessage()); + fail("IO Error while initializing the annotator .. " + e.getMessage()); + } + + try { + for (int i = 0; i < documents.size(); i++) { + // String docText = getDocumentText(docPaths.get(i)); + String docText = documents.get(i); + Set docPredictions = getPredictions(getTextAnnotation(docText), dataless); + + System.out.println("Doc" + i + ": Gold LabelIDs:"); + for (String goldLabel : docLabels.get(i)) { + System.out.println(goldLabel); + } + System.out.println("Doc" + i + ": Predicted LabelIDs:"); + + for (String predictedLabel : docPredictions) { + System.out.println(predictedLabel); + } + + assertTrue(checkSetEquality(docLabels.get(i), docPredictions)); + System.out.println(); + } + } catch (AnnotatorException e) { + e.printStackTrace(); + System.out.println("Error annotating the document .. " + e.getMessage()); + fail("Error annotating the document .. 
" + e.getMessage()); + } + } + + private boolean checkSetEquality(Set goldLabels, Set predictedLabels) { + if (goldLabels.size() != predictedLabels.size()) + return false; + + for (String goldLabel : goldLabels) { + if (predictedLabels.contains(goldLabel) == false) + return false; + } + + return true; + } + + private String getDocumentText(String testFile){ + try(BufferedReader br = new BufferedReader(new FileReader(new File(testFile)))) { + + StringBuilder sb = new StringBuilder(); + + String line; + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append(" "); + } + + String text = sb.toString().trim(); + + return text; + } catch (IOException e) { + e.printStackTrace(); + System.err.println("IO Error while reading the test file from " + testFile + " .. " + e.getMessage()); + throw new RuntimeException("IO Error while reading the test file from " + testFile + " .. " + e.getMessage()); + } + } + + private TextAnnotation getTextAnnotation(String text) { + TokenizerTextAnnotationBuilder taBuilder = + new TokenizerTextAnnotationBuilder(new StatefulTokenizer()); + TextAnnotation ta = taBuilder.createTextAnnotation(text); + + return ta; + } + + private Set getPredictions(TextAnnotation ta, ADatalessAnnotator annotator) + throws AnnotatorException { + List annots = annotator.getView(ta).getConstituents(); + + Set predictedLabels = new HashSet<>(); + + for (Constituent annot : annots) { + String label = annot.getLabel(); + predictedLabels.add(label); + } + + return predictedLabels; + } +} diff --git a/dataless-classifier/src/test/java/edu/illinois/cs/cogcomp/datalessclassification/W2VDatalessTest.java b/dataless-classifier/src/test/java/edu/illinois/cs/cogcomp/datalessclassification/W2VDatalessTest.java new file mode 100644 index 000000000..647ddb63c --- /dev/null +++ b/dataless-classifier/src/test/java/edu/illinois/cs/cogcomp/datalessclassification/W2VDatalessTest.java @@ -0,0 +1,192 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.datalessclassification; + +import org.junit.Test; + +import edu.illinois.cs.cogcomp.annotation.AnnotatorException; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent; +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation; +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager; +import edu.illinois.cs.cogcomp.datalessclassification.config.W2VDatalessConfigurator; +import edu.illinois.cs.cogcomp.datalessclassification.ta.ADatalessAnnotator; +import edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator; +import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer; +import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.*; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +/** + * @author shashank + */ +public class W2VDatalessTest { + private String configFile; + private W2VDatalessAnnotator dataless; + + private List documents; + private List> docLabels; + + @Test + public void testPredictions() { + try { + configFile = "config/project.properties"; + + ResourceManager nonDefaultRm = new ResourceManager(configFile); + ResourceManager rm = new W2VDatalessConfigurator().getConfig(nonDefaultRm); + dataless = new W2VDatalessAnnotator(rm); + + documents = new ArrayList<>(); + String doc1 = + "i m looking for some recommendations for screen capture programs a couple" + + " of issues ago pc mag listed as editor s choices both conversion artist" + + " and hijaak for windows anyone have any experience with those or some others" + + " i m trying to get an alpha manual in the next few days and i m not making much" + + " progress with the screen shots i m currently using dodot and i m about to burn it" + + " and the disks it rode it on it s got a lot of freaky bugs and oversights that are " + + "driving me crazy tonight it decided that for any graphic it writes out as a tiff " + + "file that s under a certain arbitrary size it will swap the left and right sides of" + + " the picture usually it confines itself to not copying things to the clipboard so i " + + "have to save and load pix for editing in paintbrush or crashing every hour or so the " + + "one nice thing it has though is it s dither option you d think that this would turn " + + "colors into dots which it does if you go from say colors to colors but if you go " + + "from or colors to b w you can set a threshold level for which colors turn to black " + + "and which turn to white for me this is useful because i can turn light grays on buttons" + + " to white and the dark grays to black and thereby preserve the d effect on buttons and " + + "other parts of the window if you understood my description can you tell me if another " + + "less buggy program can do this as well much thanks for any help signature david delgreco " + + "what lies behind us and what lies technically a writer before us are tiny matters compared " + + "delgreco rahul net to what lies within us oliver wendell holmes david f delgreco delgreco rahul " + + "net recommendation for screen capture program"; + documents.add(doc1); + + String doc2 = + "yes i know it s nowhere near christmas time but i m gonna loose net 
access in a few days maybe " + + "a week or if i m lucky and wanted to post this for interested people to save till xmas " + + "note bell labs is a good place if you have a phd and a good boss i have neither subject " + + "xmas light set with levels of brightness another version of a variable brightness xmas " + + "light set this set starts with a blinker bulb string diagram orginal way set 0v b b " + + "0rtn modified set for level brightness string 0v 0k w string b 0v rtn note no mods to " + + "wiring to the right of this point only one blinker is used note that the blinker " + + "would not have as much current thru it as the string bulbs because of the second " + + "string of bulbs in parallel with it that s why the use of the 0k w resistor here to " + + "add extra current thru the blinker to make up for the current shunted thru the second " + + "string while the blinker is glowing and the second string is not glowing when the " + + "blinker goes open this resistor has only a slight effect on the brightness of the " + + "strings s slightly dimmer s slightly brighter or use a w 0v bulb in place of the 0k " + + "resistor if you can get one caution do not replace with a standard c bulb as these " + + "draw too much current and burn out the blinker c approx w what you ll see when it s " + + "working powerup string will light at full brightness and b will be lit bypassing most " + + "of the current from the second string making them not light b will open placing both " + + "strings in series making the string that was out to glow at a low brightness and the " + + "other string that was on before to glow at reduced brightness be sure to wire and insulate" + + " the splices resistor leads and cut wires in a safe manner level brightness xmas light " + + "set for easter"; + documents.add(doc2); + + docLabels = new ArrayList<>(); + Set docLabels1 = + new HashSet<>(Arrays.asList("computer", "comp.os.ms.windows.misc")); + docLabels.add(docLabels1); + + Set docLabels2 = new HashSet<>(Arrays.asList("computer", "comp.windows.x")); + docLabels.add(docLabels2); + } catch (IOException e) { + e.printStackTrace(); + System.out.println("IO Error while initializing the annotator .. " + e.getMessage()); + fail("IO Error while initializing the annotator .. " + e.getMessage()); + } + + try { + for (int i = 0; i < documents.size(); i++) { + // String docText = getDocumentText(docPaths.get(i)); + String docText = documents.get(i); + Set docPredictions = getPredictions(getTextAnnotation(docText), dataless); + + System.out.println("Doc" + i + ": Gold LabelIDs:"); + for (String goldLabel : docLabels.get(i)) { + System.out.println(goldLabel); + } + + System.out.println("Doc" + i + ": Predicted LabelIDs:"); + + for (String predictedLabel : docPredictions) { + System.out.println(predictedLabel); + } + + System.out.println(); + assertTrue(checkSetEquality(docLabels.get(i), docPredictions)); + } + } catch (AnnotatorException e) { + e.printStackTrace(); + System.out.println("Error annotating the document .. " + e.getMessage()); + fail("Error annotating the document .. 
" + e.getMessage()); + } + } + + private boolean checkSetEquality(Set goldLabels, Set predictedLabels) { + if (goldLabels.size() != predictedLabels.size()) + return false; + + for (String goldLabel : goldLabels) { + if (predictedLabels.contains(goldLabel) == false) + return false; + } + + return true; + } + + private String getDocumentText(String testFile) { + try(BufferedReader br = new BufferedReader(new FileReader(new File(testFile)))) { + + StringBuilder sb = new StringBuilder(); + + String line; + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append(" "); + } + + String text = sb.toString().trim(); + + return text; + } catch (IOException e) { + e.printStackTrace(); + System.err.println("IO Error while reading the test file from " + testFile + " .. " + e.getMessage()); + throw new RuntimeException("IO Error while reading the test file from " + testFile + " .. " + e.getMessage()); + } + } + + private TextAnnotation getTextAnnotation(String text) { + TokenizerTextAnnotationBuilder taBuilder = + new TokenizerTextAnnotationBuilder(new StatefulTokenizer()); + TextAnnotation ta = taBuilder.createTextAnnotation(text); + + return ta; + } + + private Set getPredictions(TextAnnotation ta, ADatalessAnnotator annotator) + throws AnnotatorException { + List annots = annotator.getView(ta).getConstituents(); + + Set predictedLabels = new HashSet<>(); + + for (Constituent annot : annots) { + String label = annot.getLabel(); + predictedLabels.add(label); + } + + return predictedLabels; + } +} diff --git a/pipeline/pom.xml b/pipeline/pom.xml index 1e30f9d76..961884fae 100644 --- a/pipeline/pom.xml +++ b/pipeline/pom.xml @@ -95,7 +95,11 @@ illinois-relation-extraction 4.0.0 - + + edu.illinois.cs.cogcomp + illinois-datalessclassification + 4.0.0 + edu.illinois.cs.cogcomp illinois-srl diff --git a/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/common/PipelineConfigurator.java b/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/common/PipelineConfigurator.java index 55fca262c..df8657d57 100644 --- a/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/common/PipelineConfigurator.java +++ b/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/common/PipelineConfigurator.java @@ -48,6 +48,8 @@ public class PipelineConfigurator extends AnnotatorServiceConfigurator { public static final Property USE_SRL_INTERNAL_PREPROCESSOR = new Property( SrlConfigurator.INSTANTIATE_PREPROCESSOR.key, FALSE); public static final Property USE_TIMEX3 = new Property("useTimex3", FALSE); + public static final Property USE_DATALESS_ESA = new Property("useDatalessESA", FALSE); + public static final Property USE_DATALESS_W2V = new Property("useDatalessW2V", FALSE); /** * if 'true', the PipelineFactory will return a sentence-level pipeline that will use all viable @@ -79,7 +81,9 @@ public ResourceManager getDefaultConfig() { USE_STANFORD_PARSE, USE_STANFORD_DEP, USE_SRL_VERB, USE_SRL_NOM, USE_SRL_PREP, USE_SRL_COMMA, USE_QUANTIFIER, USE_VERB_SENSE, USE_JSON, USE_RELATION, USE_LAZY_INITIALIZATION, USE_SRL_INTERNAL_PREPROCESSOR, SPLIT_ON_DASH, - USE_SENTENCE_PIPELINE, USE_TIMEX3, USE_MENTION, USE_TRANSLITERATION}; + USE_SENTENCE_PIPELINE, USE_TIMEX3, USE_MENTION, USE_TRANSLITERATION, + USE_DATALESS_ESA, USE_DATALESS_W2V}; + return (new AnnotatorServiceConfigurator().getConfig(new ResourceManager( generateProperties(properties)))); } diff --git a/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/main/PipelineFactory.java 
b/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/main/PipelineFactory.java
index 544e22141..555394463 100644
--- a/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/main/PipelineFactory.java
+++ b/pipeline/src/main/java/edu/illinois/cs/cogcomp/pipeline/main/PipelineFactory.java
@@ -14,6 +14,10 @@
 import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
 import edu.illinois.cs.cogcomp.core.utilities.configuration.Configurator;
 import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager;
+import edu.illinois.cs.cogcomp.datalessclassification.config.ESADatalessConfigurator;
+import edu.illinois.cs.cogcomp.datalessclassification.config.W2VDatalessConfigurator;
+import edu.illinois.cs.cogcomp.datalessclassification.ta.ESADatalessAnnotator;
+import edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator;
 import edu.illinois.cs.cogcomp.depparse.DepAnnotator;
 import edu.illinois.cs.cogcomp.ner.NERAnnotator;
 import edu.illinois.cs.cogcomp.ner.NerAnnotatorManager;
@@ -136,6 +140,14 @@ public static BasicAnnotatorService buildPipeline(Boolean disableCache, String..
             case ViewNames.RELATION:
                 nonDefaultValues.put(PipelineConfigurator.USE_RELATION.key,
                         Configurator.TRUE);
+                break;
+            case ViewNames.DATALESS_ESA:
+                nonDefaultValues.put(PipelineConfigurator.USE_DATALESS_ESA.key,
+                        Configurator.TRUE);
+                break;
+            case ViewNames.DATALESS_W2V:
+                nonDefaultValues.put(PipelineConfigurator.USE_DATALESS_W2V.key,
+                        Configurator.TRUE);
                 break;
             default:
                 logger.warn("View name "
@@ -387,6 +399,16 @@ private static Map buildAnnotators(ResourceManager nonDefault
             TemporalChunkerAnnotator tca =
                     new TemporalChunkerAnnotator(new ResourceManager(rmProps));
             viewGenerators.put(ViewNames.TIMEX3, tca);
         }
+        if (rm.getBoolean(PipelineConfigurator.USE_DATALESS_ESA)) {
+            ResourceManager esaRm = new ESADatalessConfigurator().getConfig(nonDefaultRm);
+            ESADatalessAnnotator esaDataless = new ESADatalessAnnotator(esaRm);
+            viewGenerators.put(ViewNames.DATALESS_ESA, esaDataless);
+        }
+        if (rm.getBoolean(PipelineConfigurator.USE_DATALESS_W2V)) {
+            ResourceManager w2vRm = new W2VDatalessConfigurator().getConfig(nonDefaultRm);
+            W2VDatalessAnnotator w2vDataless = new W2VDatalessAnnotator(w2vRm);
+            viewGenerators.put(ViewNames.DATALESS_W2V, w2vDataless);
+        }
         return viewGenerators;
     }
diff --git a/pom.xml b/pom.xml
index a7a72b890..bd8be6c3a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -32,6 +32,7 @@
         <module>prepsrl</module>
        <module>verbsense</module>
        <module>pipeline</module>
+        <module>dataless-classifier</module>
        <module>external/external-commons</module>
        <module>external/clausie</module>
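
For reference, the following is a minimal end-to-end sketch of invoking one of the new annotators directly, mirroring the invocation pattern used in W2VDatalessTest above. The class name `DatalessDemo` and the inline sample sentence are illustrative only; the config path is the sample `config/project.properties` shipped with the module, and every API call shown is one exercised by the tests in this patch.

```java
import edu.illinois.cs.cogcomp.annotation.AnnotatorException;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager;
import edu.illinois.cs.cogcomp.datalessclassification.config.W2VDatalessConfigurator;
import edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator;
import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;
import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder;

import java.io.IOException;

public class DatalessDemo {
    public static void main(String[] args) throws IOException, AnnotatorException {
        // Read the sample config (the same file the unit tests use) and merge in the defaults.
        ResourceManager nonDefaultRm = new ResourceManager("config/project.properties");
        ResourceManager rm = new W2VDatalessConfigurator().getConfig(nonDefaultRm);

        // Constructing the annotator loads the embeddings, so reuse a single instance.
        W2VDatalessAnnotator dataless = new W2VDatalessAnnotator(rm);

        // The annotator expects a TOKENS view; build one with the tokenizer of your choice.
        TokenizerTextAnnotationBuilder taBuilder =
                new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        TextAnnotation ta =
                taBuilder.createTextAnnotation("encryption key crypto algorithm security");

        // Each constituent of the DATALESS_W2V view spans the whole document and carries
        // one predicted label ID from the configured hierarchy (here, the 20newsgroups sample).
        for (Constituent c : dataless.getView(ta).getConstituents()) {
            System.out.println(c.getLabel());
        }
    }
}
```

Swapping in ESADatalessAnnotator with ESADatalessConfigurator produces the DATALESS_ESA view in the same way; when going through the pipeline instead, the annotators are obtained by requesting ViewNames.DATALESS_ESA or ViewNames.DATALESS_W2V from PipelineFactory, as wired up in the hunks above.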