1 | /* |
---|
2 | * To change this license header, choose License Headers in Project Properties. |
---|
3 | * To change this template file, choose Tools | Templates |
---|
4 | * and open the template in the editor. |
---|
5 | */ |
---|
6 | package nl.mpi.annot.translate; |
---|
7 | |
---|
8 | import java.util.ArrayList; |
---|
9 | |
---|
10 | /** |
---|
11 | * |
---|
12 | * @author petbei |
---|
13 | * A Trellis (or this Trellis anyway) is a data structure to store words, 1 character per layer. |
---|
14 | * At the top layer the 'root' node is the start of a linked list of alphabetically sorted TrellisNodes, each pointing to the next list element via the 'next' pointer: |
---|
15 | * .---. .---. |
---|
16 | * root -> | a | -next-> | f | -next-> NULL |
---|
17 | * '---' '---' |
---|
18 | * (so, in this example words starting with 'a' and 'f' are defined.) |
---|
19 | * |
---|
20 | * Each node has a 'son' pointer to a (alphabetically sorted) linked list of TrellisNodes that define the continuation of the word: |
---|
21 | * |
---|
22 | * .---. .---. |
---|
23 | * root -> | a | -next-> | f | -next-> NULL |
---|
24 | * '---' '---' |
---|
25 | * | |
---|
26 | * son |
---|
27 | * | |
---|
28 | * V |
---|
29 | * .---. .---. |
---|
30 | * | r | -next-> | s | -next-> NULL |
---|
31 | * '---' '---' |
---|
32 | * |
---|
33 | * so that the word (starts) 'ar', 'as' and 'f' are defined. |
---|
34 | * |
---|
35 | * This trellis is used to store translation pairs for words or word segments. At the appropriate position in the trellis a list of 'targets' can be defined, |
---|
36 | * that signifies that the "source" word in the trellis that ends here can be seen as a complete unit, and has translations. |
---|
37 | * NB: the specific position in the Trellis can still have 'next siblings and 'son's. |
---|
38 | */ |
---|
39 | |
---|
40 | public class Trellis { |
---|
41 | private TrellisNode root; //--- pointer to the top level leftmost Trellis node. Initially null/undefined. |
---|
42 | public Trellis() { |
---|
43 | // System.out.println("Translation Trellis created"); |
---|
44 | root = null; |
---|
45 | } |
---|
46 | |
---|
47 | //--- the trellis is built up using source/target String pairs. Adding nodes is done recursively, 1 source letter at the time. |
---|
48 | public void addTranslationPair(String src, String tgt) { |
---|
49 | //--- assertion: src and tgt are neither null nor empty. XML files are currently used to define them, and the nodes attributes are checked for validity during parsing. |
---|
50 | if (root == null && src.length() > 0) { |
---|
51 | //--- first node of trellis is defined here, using the first letter of the source string. |
---|
52 | root = new TrellisNode(src.charAt(0)); |
---|
53 | } |
---|
54 | //--- a full source word is added recursively, starting at the first character of the source |
---|
55 | root = root.addTranslation(src, tgt); |
---|
56 | } |
---|
57 | |
---|
58 | ArrayList<String> translate(String source) { |
---|
59 | ArrayList<ArrayList<String>> inter = new ArrayList<ArrayList<String>>(); // intermediate translation results: for each segment recognised an array of Strings. |
---|
60 | if (root == null || source.length() <= 0) { |
---|
61 | return new ArrayList<String>(); |
---|
62 | } |
---|
63 | //=== perform translation per source segment. |
---|
64 | // the TrellisNode 'translate' method |
---|
65 | // - tells how large the segment is that it recognised |
---|
66 | // - adds an array of partal translations to the intermediate results |
---|
67 | // If nothing is recognised, the current character is skipped (not translated), and work continues with the next position |
---|
68 | int offset = 0; |
---|
69 | while (offset < source.length()) { |
---|
70 | int processed = root.translate(source.substring(offset), inter); |
---|
71 | if (processed > 0) { |
---|
72 | offset = offset + processed; |
---|
73 | } |
---|
74 | else { |
---|
75 | System.err.println("Segmented translation of " + source + ": position " + offset + " (" + source.charAt(offset) + ") skipped"); |
---|
76 | offset = offset + 1; |
---|
77 | } |
---|
78 | } |
---|
79 | |
---|
80 | //=== now, the partial results must be combined. This is done in an inefficient but simple way. |
---|
81 | ArrayList<String> final1 = new ArrayList<String>(); |
---|
82 | ArrayList<String> final2 = new ArrayList<String>(); |
---|
83 | |
---|
84 | for (ArrayList<String> al: inter) { |
---|
85 | if (final2.size() == 0) { |
---|
86 | //--- no final results yet, start with copying the initial segment's targts to the work-in-progress final1 list |
---|
87 | for (String s: al) { |
---|
88 | final1.add(s); |
---|
89 | } |
---|
90 | } |
---|
91 | else { |
---|
92 | //--- we have already partial results. They must be combined with the next segment's targets. |
---|
93 | for (String s1: final2) { |
---|
94 | for (String s2: al) { |
---|
95 | final1.add(s1 + s2); |
---|
96 | } |
---|
97 | } |
---|
98 | } |
---|
99 | //--- move temporary results to input for the next round, reset temporary results |
---|
100 | final2 = final1; |
---|
101 | final1 = new ArrayList<String>(); |
---|
102 | } |
---|
103 | |
---|
104 | //=== so now the results (if any) are in final2 |
---|
105 | //for (String f: final2) { |
---|
106 | // System.out.println("result \"" + f + "\""); |
---|
107 | //} |
---|
108 | //=== move to a real array |
---|
109 | //String[] result = new String[final2.size()]; |
---|
110 | //for (int i = 0; i < final2.size(); i++) { |
---|
111 | // result[i] = final2.get(i); |
---|
112 | //} |
---|
113 | return final2; |
---|
114 | } |
---|
115 | |
---|
116 | //--- show contents of trellis |
---|
117 | public void display() { |
---|
118 | System.out.println("===== START Trellis ====="); |
---|
119 | if (root == null) { |
---|
120 | System.out.println(" no root."); //--- empty trellis |
---|
121 | } |
---|
122 | else { |
---|
123 | root.display(1); //--- recursive display of nodes; integer parameter indicates indentation level. |
---|
124 | } |
---|
125 | System.out.println("===== END Trellis =====\n"); |
---|
126 | } |
---|
127 | } |
---|