source: vlo/trunk/vlo-importer/src/main/java/eu/clarin/cmdi/vlo/importer/ResourceStructureGraph.java @ 6698

Last change on this file since 6698 was 6698, checked in by teckart@informatik.uni-leipzig.de, 9 years ago

Added new helper methods and more consistent handling of normalized vertex IDs

File size: 12.6 KB
Line 
1/*
2 * Copyright (C) 2015 CLARIN
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 */
17package eu.clarin.cmdi.vlo.importer;
18
19import eu.clarin.cmdi.vlo.StringUtils;
20import java.util.ArrayList;
21import java.util.HashMap;
22import java.util.HashSet;
23import java.util.Iterator;
24import java.util.List;
25import java.util.Map;
26import java.util.Set;
27import org.jgrapht.experimental.dag.DirectedAcyclicGraph;
28import org.jgrapht.graph.DefaultEdge;
29import org.slf4j.Logger;
30import org.slf4j.LoggerFactory;
31
32/**
33 * Stores relationship graph between CMDI files based on ResourceProxys.
34 * Generates weight for every file based on its position in the hierarchy
35 * (weight = number of edges to a top node)
36 *
37 * @author Thomas Eckart
38 */
39public class ResourceStructureGraph {
40
41    protected final static Logger LOG = LoggerFactory.getLogger(ResourceStructureGraph.class);
42
43    private static DirectedAcyclicGraph<CmdiVertex, DefaultEdge> graph = new DirectedAcyclicGraph<CmdiVertex, DefaultEdge>(
44            DefaultEdge.class);
45    private static Map<String, CmdiVertex> vertexIdMap = new HashMap<String, CmdiVertex>();
46    private static Set<CmdiVertex> foundVerticesSet = new HashSet<CmdiVertex>();
47
48    /**
49     * Adds new vertex to graph, used to remember all CMDI files that were
50     * actually seen
51     *
52     * @param mdSelfLink extracted MdSelfLink from CMDI file
53     */
54    public static void addResource(String mdSelfLink) {
55        String normalizedMdSelfLink = StringUtils.normalizeIdString(mdSelfLink);
56        if (!vertexIdMap.containsKey(normalizedMdSelfLink)) {
57            CmdiVertex newVertex = new CmdiVertex(normalizedMdSelfLink);
58            vertexIdMap.put(normalizedMdSelfLink, newVertex);
59        }
60
61        if (!foundVerticesSet.contains(vertexIdMap.get(normalizedMdSelfLink))) {
62            graph.addVertex(vertexIdMap.get(normalizedMdSelfLink));
63            foundVerticesSet.add(vertexIdMap.get(normalizedMdSelfLink));
64        } else {
65            LOG.debug("Duplicate resource vertex mdSelfLink: " + mdSelfLink);
66        }
67    }
68
69    /**
70     * Add new edge to graph
71     *
72     * @param sourceVertexId source vertex ID (=isPart)
73     * @param targetVertexId target vertex ID (=hasPart)
74     */
75    public static void addEdge(String sourceVertexId, String targetVertexId) {
76        String normalizedSourceVertexId = StringUtils.normalizeIdString(sourceVertexId);
77        String normalizedTargetVertexId = StringUtils.normalizeIdString(targetVertexId);
78       
79        // add vertices
80        if (!vertexIdMap.containsKey(normalizedSourceVertexId)) {
81            CmdiVertex sourceVertex = new CmdiVertex(normalizedSourceVertexId);
82            vertexIdMap.put(normalizedSourceVertexId, sourceVertex);
83            graph.addVertex(sourceVertex);
84        }
85
86        if (!vertexIdMap.containsKey(normalizedTargetVertexId)) {
87            CmdiVertex targetVertex = new CmdiVertex(normalizedTargetVertexId);
88            vertexIdMap.put(normalizedTargetVertexId, targetVertex);
89            graph.addVertex(targetVertex);
90        }
91
92        // add edge
93        try {
94            graph.addEdge(vertexIdMap.get(normalizedSourceVertexId), vertexIdMap.get(normalizedTargetVertexId));
95            updateDepthValues(vertexIdMap.get(normalizedSourceVertexId), new HashSet<CmdiVertex>());
96        } catch (IllegalArgumentException cfe) {
97            // was a cycle -> ignore
98            LOG.debug("Found cycle\t" + sourceVertexId + "\t" + targetVertexId);
99        }
100    }
101
102    /**
103     * Update depth weights for startVertex and related vertices (in both
104     * hierarchy directions)
105     *
106     * @param startVertex
107     * @param alreadySeenVerticesSet set of already seen vertices to avoid
108     * infinite loops for cycles
109     */
110    private static void updateDepthValues(CmdiVertex startVertex, Set<CmdiVertex> alreadySeenVerticesSet) {
111        alreadySeenVerticesSet = new HashSet<CmdiVertex>();
112        alreadySeenVerticesSet.add(startVertex);
113
114        // upwards, is part of other resource -> use decremented minimal value
115        Set<DefaultEdge> outgoingEdgeSet = graph.outgoingEdgesOf(startVertex);
116        if (!outgoingEdgeSet.isEmpty()) {
117            Iterator<DefaultEdge> edgeIter = outgoingEdgeSet.iterator();
118            CmdiVertex maxVertex = new CmdiVertex("DUMMY");
119            maxVertex.setHierarchyWeight(1);
120            while (edgeIter.hasNext()) {
121                CmdiVertex targetVertex = graph.getEdgeTarget(edgeIter.next());
122                if (targetVertex.getHierarchyWeight() <= maxVertex.getHierarchyWeight()) {
123                    maxVertex = targetVertex;
124                }
125            }
126            if (maxVertex.getHierarchyWeight() <= startVertex.getHierarchyWeight()) {
127                LOG.debug("UP UPDATE  \t" + startVertex.getId() + "\t" + maxVertex.getId() + "\t"
128                        + startVertex.getHierarchyWeight() + " --> " + (maxVertex.getHierarchyWeight() - 1));
129                alreadySeenVerticesSet.add(maxVertex);
130                startVertex.setHierarchyWeight(maxVertex.getHierarchyWeight() - 1);
131            }
132        }
133
134        // downwards, has other resources as part -> decrement their depth value if smaller
135        Set<DefaultEdge> incomingEdgeSet = graph.incomingEdgesOf(startVertex);
136        if (!incomingEdgeSet.isEmpty()) {
137            Iterator<DefaultEdge> edgeIter = incomingEdgeSet.iterator();
138            while (edgeIter.hasNext()) {
139                backwardUpdate(graph.getEdgeSource(edgeIter.next()), startVertex.getHierarchyWeight() - 1, alreadySeenVerticesSet);
140            }
141        }
142    }
143
144    private static void backwardUpdate(CmdiVertex updateVertex, int newDepth, Set<CmdiVertex> alreadySeenVerticesSet) {
145        if (!alreadySeenVerticesSet.contains(updateVertex) && updateVertex.getHierarchyWeight() > newDepth) {
146            alreadySeenVerticesSet.add(updateVertex);
147            LOG.debug("DOWN UPDATE\t" + updateVertex.getId() + "\t" + updateVertex.getHierarchyWeight() + " --> "
148                    + newDepth);
149            updateVertex.setHierarchyWeight(newDepth);
150
151            Set<DefaultEdge> incomingEdgeSet = graph.incomingEdgesOf(updateVertex);
152            if (!incomingEdgeSet.isEmpty()) {
153                Iterator<DefaultEdge> edgeIter = incomingEdgeSet.iterator();
154                while (edgeIter.hasNext()) {
155                    backwardUpdate(graph.getEdgeSource(edgeIter.next()), newDepth - 1, alreadySeenVerticesSet);
156                }
157            }
158        }
159    }
160
161    public static DirectedAcyclicGraph<CmdiVertex, DefaultEdge> getResourceGraph() {
162        return graph;
163    }
164
165    public static Set<CmdiVertex> getFoundVertices() {
166        return foundVerticesSet;
167    }
168   
169    public static CmdiVertex getVertex(String vertexId) {
170        return vertexIdMap.get(vertexId);       
171    }
172   
173    public static Map<String, CmdiVertex> getVertexIdMap() {
174        return vertexIdMap;
175    }
176
177    /**
178     * Get all vertices that are source of an edge where targetVertex is target.
179     * In other words get all resource vertices that are part of resource targetVertex.
180     *
181     * @param targetVertex
182     * @return List of vertices that are source of an edge where targetVertex is
183     * target
184     */
185    public static List<String> getIncomingVertexNames(CmdiVertex targetVertex) {
186        List<String> vertexNamesList = new ArrayList<String>();
187        Set<DefaultEdge> incomingEdges = graph.incomingEdgesOf(targetVertex);
188        Iterator<DefaultEdge> edgeIter = incomingEdges.iterator();
189        while (edgeIter.hasNext()) {
190            DefaultEdge edge = edgeIter.next();
191            if (getFoundVertices().contains(graph.getEdgeSource(edge))) {
192                vertexNamesList.add(graph.getEdgeSource(edge).getId());
193            }
194        }
195
196        return vertexNamesList;
197    }
198
199    /**
200     * Get all vertices that are target of an edge where sourceVertex is source.
201     * In other words get all resource vertices of which resource sourceVertex is part of.
202     *
203     * @param sourceVertex
204     * @return List of vertices that are target of an edge where sourceVertex is
205     * source
206     */
207    public static List<String> getOutgoingVertexNames(CmdiVertex sourceVertex) {
208        List<String> vertexNamesList = new ArrayList<String>();
209        Set<DefaultEdge> outgoingEdges = graph.outgoingEdgesOf(sourceVertex);
210        Iterator<DefaultEdge> edgeIter = outgoingEdges.iterator();
211        while (edgeIter.hasNext()) {
212            DefaultEdge edge = edgeIter.next();
213            if (getFoundVertices().contains(graph.getEdgeTarget(edge))) {
214                vertexNamesList.add(graph.getEdgeTarget(edge).getId());
215            }
216        }
217
218        return vertexNamesList;
219    }
220
221    /**
222     * Reset resource hierarchy graph (= deleting vertices + edges)
223     */
224    public static void clearResourceGraph() {
225        vertexIdMap = new HashMap<String, CmdiVertex>();
226        foundVerticesSet = new HashSet<CmdiVertex>();
227        graph = new DirectedAcyclicGraph<CmdiVertex, DefaultEdge>(DefaultEdge.class);
228    }
229
230    /**
231     * Get some statistics about the resource hierarchy graph
232     *
233     * @param maxBrokenEdges maximal number of examples of broken edges (=source
234     * or target vertex is missing) included in the output
235     * @return some statistics about the resource hierarchy graph
236     */
237    public static String printStatistics(int maxBrokenEdges) {
238        StringBuilder sb = new StringBuilder();
239
240        // vertex + edge sets size
241        sb.append("vertices: ").append(foundVerticesSet.size()).append(", edges: ").append(graph.edgeSet().size());
242
243        // count broken + valid edges
244        int count = 0;
245        Iterator<DefaultEdge> edgeIter = graph.edgeSet().iterator();
246        HashSet<DefaultEdge> brokenEdgeSet = new HashSet<DefaultEdge>();
247        while (edgeIter.hasNext()) {
248            DefaultEdge edge = edgeIter.next();
249            if (foundVerticesSet.contains(graph.getEdgeTarget(edge)) && foundVerticesSet.contains(graph.getEdgeSource(edge))) {
250                count++;
251            } else {
252                brokenEdgeSet.add(edge);
253            }
254        }
255        sb.append(", broken edges: ").append(brokenEdgeSet.size());
256
257        // show some broken edges
258        if (maxBrokenEdges != 0) {
259            int counter = 0;
260            Iterator<DefaultEdge> brokenEdgeIter = brokenEdgeSet.iterator();
261            while (brokenEdgeIter.hasNext()) {
262                if ((++counter) <= maxBrokenEdges) {
263                    DefaultEdge brokenEdge = brokenEdgeIter.next();
264                    LOG.debug("Broken edge: " + graph.getEdgeSource(brokenEdge) + " --> "
265                            + graph.getEdgeTarget(brokenEdge));
266                } else {
267                    break;
268                }
269            }
270        }
271
272        // valid edges
273        sb.append(", valid edges: ").append(count);
274        return sb.toString();
275    }
276}
277
278/**
279 * Stores all information of a vertex (=CMDI file) in the CMDI hierarchy graph
280 *
281 * @author Thomas Eckart
282 */
283class CmdiVertex {
284
285    private final String id;
286    private int hierarchyWeight = 0;
287    private boolean wasImported = false;
288
289    public CmdiVertex(String id) {
290        this.id = id;
291    }
292
293    public String getId() {
294        return id;
295    }
296
297    public void setHierarchyWeight(int hierarchyWeight) {
298        this.hierarchyWeight = hierarchyWeight;
299    }
300
301    public int getHierarchyWeight() {
302        return hierarchyWeight;
303    }
304   
305    public void setWasImported(boolean wasImported) {
306        this.wasImported = wasImported;
307    }
308   
309    public boolean getWasImported() {
310        return wasImported;
311    }
312
313    @Override
314    public String toString() {
315        return id + " (" + hierarchyWeight + ")";
316    }
317
318    @Override
319    public boolean equals(Object obj) {
320        if (this == obj) {
321            return true;
322        }
323        if (obj == null) {
324            return false;
325        }
326        if (!(obj instanceof CmdiVertex)) {
327            return false;
328        }
329        CmdiVertex other = (CmdiVertex) obj;
330        if (this.id.equals(other.id)) {
331            return true;
332        }
333        if (this.id == null && other.id != null) {
334            return false;
335        }
336        return this.id.equals(other.id);
337    }
338
339    @Override
340    public int hashCode() {
341        return id.hashCode();
342    }
343}
Note: See TracBrowser for help on using the repository browser.