source: annotrans/trunk/src/main/java/nl/mpi/annot/translate/Trellis.java @ 6979

Last change on this file since 6979 was 6979, checked in by peter.beinema@mpi.nl, 8 years ago

first upload

File size: 5.5 KB
Line 
1/*
2 * To change this license header, choose License Headers in Project Properties.
3 * To change this template file, choose Tools | Templates
4 * and open the template in the editor.
5 */
6package nl.mpi.annot.translate;
7
8import java.util.ArrayList;
9
10/**
11 *
12 * @author petbei
13 * A Trellis (or this Trellis anyway) is a data structure to store words, 1 character per layer.
14 * At the top layer the 'root' node is the start of a linked list of alphabetically sorted TrellisNodes, each pointing to the next list element via the 'next' pointer:
15 *         .---.         .---.
16 * root -> | a | -next-> | f | -next-> NULL
17 *         '---'         '---'
18 * (so, in this example words starting with 'a' and 'f' are defined.)
19 *
20 * Each node has a 'son' pointer to a (alphabetically sorted) linked list of TrellisNodes that define the continuation of the word:
21 *
22 *         .---.         .---.
23 * root -> | a | -next-> | f | -next-> NULL
24 *         '---'         '---'
25 *           |
26 *          son
27 *           |
28 *           V
29 *         .---.         .---.
30 *         | r | -next-> | s | -next-> NULL
31 *         '---'         '---'
32 *
33 * so that the word (starts) 'ar', 'as' and 'f' are defined.
34 *
35 * This trellis is used to store translation pairs for words or word segments. At the appropriate position in the trellis a list of 'targets' can be defined,
36 * that signifies that the "source" word in the trellis that ends here can be seen as a complete unit, and has translations.
37 * NB: the specific position in the Trellis can still have 'next siblings and 'son's.
38 */
39
40public class Trellis {
41    private TrellisNode root;   //--- pointer to the top level leftmost Trellis node. Initially null/undefined.
42    public Trellis() {
43        // System.out.println("Translation Trellis created");
44        root = null;
45    }
46   
47    //--- the trellis is built up using source/target String pairs. Adding nodes is done recursively, 1 source letter at the time.
48    public void addTranslationPair(String src, String tgt) {
49        //--- assertion: src and tgt are neither null nor empty. XML files are currently used to define them, and the nodes attributes are checked for validity during parsing.
50        if (root == null && src.length() > 0) {
51            //--- first node of trellis is defined here, using the first letter of the source string.
52            root = new TrellisNode(src.charAt(0));
53        }
54        //--- a full source word is added recursively, starting at the first character of the source
55        root = root.addTranslation(src, tgt);
56    }
57   
58    ArrayList<String> translate(String source) {
59        ArrayList<ArrayList<String>> inter = new ArrayList<ArrayList<String>>();    // intermediate translation results: for each segment recognised an array of Strings.
60        if (root == null || source.length() <= 0) {
61            return new ArrayList<String>();
62        }
63        //=== perform translation per source segment.
64        // the TrellisNode 'translate' method
65        //     - tells how large the segment is that it recognised
66        //     - adds an array of partal translations to the intermediate results
67        // If nothing is recognised, the current character is skipped (not translated), and work continues with the next position
68        int offset = 0;
69        while (offset < source.length()) {
70            int processed = root.translate(source.substring(offset), inter);
71            if (processed > 0) {
72                offset = offset + processed;
73            }
74            else {
75                System.err.println("Segmented translation of " + source + ": position " + offset + " (" + source.charAt(offset) + ") skipped");
76                offset = offset + 1;
77            }
78        }
79       
80        //=== now, the partial results must be combined. This is done in an inefficient but simple way.
81        ArrayList<String> final1 = new ArrayList<String>();
82        ArrayList<String> final2 = new ArrayList<String>();
83       
84        for (ArrayList<String> al: inter) {
85            if (final2.size() == 0) {
86                //--- no final results yet, start with copying the initial segment's targts to the work-in-progress final1 list
87                for (String s: al) {
88                    final1.add(s);
89                }
90            }
91            else {
92                //--- we have already partial results. They must be combined with the next segment's targets.
93                for (String s1: final2) {
94                    for (String s2: al) {
95                        final1.add(s1 + s2);
96                    }
97                }
98            }
99            //--- move temporary results to input for the next round, reset temporary results
100            final2 = final1;
101            final1 = new ArrayList<String>();
102        }
103       
104        //=== so now the results (if any) are in final2
105        //for (String f: final2) {
106        //    System.out.println("result \"" + f + "\"");
107        //}
108        //=== move to a real array
109        //String[] result = new String[final2.size()];
110        //for (int i = 0; i < final2.size(); i++) {
111        //    result[i] = final2.get(i);
112        //}
113        return final2;
114    }
115   
116    //--- show contents of trellis
117    public void display() {
118        System.out.println("===== START Trellis =====");
119        if (root == null) {
120            System.out.println("    no root."); //--- empty trellis
121        }
122        else {
123            root.display(1);    //--- recursive display of nodes; integer parameter indicates indentation level.
124        }
125        System.out.println("===== END Trellis =====\n");
126    }
127}
Note: See TracBrowser for help on using the repository browser.