// FreeLing 3.0
00001 00002 // 00003 // FreeLing - Open Source Language Analyzers 00004 // 00005 // Copyright (C) 2004 TALP Research Center 00006 // Universitat Politecnica de Catalunya 00007 // 00008 // This library is free software; you can redistribute it and/or 00009 // modify it under the terms of the GNU General Public 00010 // License as published by the Free Software Foundation; either 00011 // version 3 of the License, or (at your option) any later version. 00012 // 00013 // This library is distributed in the hope that it will be useful, 00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public 00019 // License along with this library; if not, write to the Free Software 00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00021 // 00022 // contact: Lluis Padro (padro@lsi.upc.es) 00023 // TALP Research Center 00024 // despatx C6.212 - Campus Nord UPC 00025 // 08034 Barcelona. 
SPAIN 00026 // 00028 00029 #ifndef _LANGUAGE 00030 #define _LANGUAGE 00031 00032 #include <string> 00033 #include <list> 00034 #include <vector> 00035 #include <set> 00036 #include <map> 00037 #include <boost/regex/icu.hpp> 00038 00039 #include "freeling/windll.h" 00040 #include "freeling/tree.h" 00041 00042 class word; // predeclaration 00043 00048 00049 class WINDLL analysis { 00050 00051 private: 00053 std::wstring lemma; 00055 std::wstring tag; 00057 double prob; 00059 double distance; 00061 std::list<std::pair<std::wstring,double> > senses; 00063 std::list<word> retok; 00064 00065 // cache for short versions of Eagles tags; 00066 static std::map<std::wstring,std::wstring> stag_cache; 00067 00068 std::wstring shorten_tag(const std::wstring &s=L"V") const; 00069 00070 // store which sequences --among the kbest proposed by 00071 // the tagger-- contain this analysis 00072 std::set<int> selected_kbest; 00073 00074 public: 00076 std::vector<std::wstring> user; 00077 00079 analysis(); 00081 analysis(const std::wstring &, const std::wstring &); 00083 analysis& operator=(const analysis&); 00084 00085 void set_lemma(const std::wstring &); 00086 void set_tag(const std::wstring &); 00087 void set_prob(double); 00088 void set_distance(double); 00089 void set_retokenizable(const std::list<word> &); 00090 00091 bool has_prob() const; 00092 bool has_distance() const; 00093 std::wstring get_lemma() const; 00094 std::wstring get_tag() const; 00095 std::wstring get_short_tag() const; 00096 std::wstring get_short_tag(const std::wstring &) const; 00097 double get_prob() const; 00098 double get_distance() const; 00099 bool is_retokenizable() const; 00100 std::list<word> get_retokenizable() const; 00101 00102 std::list<std::pair<std::wstring,double> > get_senses() const; 00103 void set_senses(const std::list<std::pair<std::wstring,double> > &); 00104 // useful for java API 00105 std::wstring get_senses_string() const; 00106 00107 // get the largest kbest sequence index the analysis 
is selected in. 00108 int max_kbest() const; 00109 // find out whether the analysis is selected in the tagger k-th best sequence 00110 bool is_selected(int k=0) const; 00111 // mark this analysis as selected in k-th best sequence 00112 void mark_selected(int k=0); 00113 // unmark this analysis as selected in k-th best sequence 00114 void unmark_selected(int k=0); 00115 00117 bool operator<(const analysis &) const; 00119 bool operator==(const analysis &) const; 00120 }; 00121 00122 00127 00128 class WINDLL word : public std::list<analysis> { 00129 private: 00131 std::wstring form; 00133 std::wstring lc_form; 00135 std::wstring ph_form; 00137 std::list<word> multiword; 00139 bool ambiguous_mw; 00141 std::list<std::pair<word,double> > alternatives; 00143 unsigned long start, finish; 00145 bool in_dict; 00147 bool locked; 00149 void clone(const word &); 00150 00152 static const int SELECTED=0; 00153 static const int UNSELECTED=1; 00154 static const int ALL=2; 00155 00156 public: 00157 // predeclarations 00158 class iterator; 00159 class const_iterator; 00160 00162 std::vector<std::wstring> user; 00163 00165 word(); 00167 word(const std::wstring &); 00169 word(const std::wstring &, const std::list<word> &); 00171 word(const std::wstring &, const std::list<analysis> &, const std::list<word> &); 00173 word(const word &); 00175 word& operator=(const word&); 00176 00178 void copy_analysis(const word &); 00180 int get_n_selected(int k=0) const; 00182 int get_n_unselected(int k=0) const; 00184 bool is_multiword() const; 00186 bool is_ambiguous_mw() const; 00188 void set_ambiguous_mw(bool); 00190 int get_n_words_mw() const; 00192 std::list<word> get_words_mw() const; 00194 std::wstring get_form() const; 00196 std::wstring get_lc_form() const; 00198 std::wstring get_ph_form() const; 00200 word::iterator selected_begin(int k=0); 00202 word::const_iterator selected_begin(int k=0) const; 00204 word::iterator selected_end(int k=0); 00206 word::const_iterator selected_end(int k=0) 
const; 00208 word::iterator unselected_begin(int k=0); 00210 word::const_iterator unselected_begin(int k=0) const; 00212 word::iterator unselected_end(int k=0); 00214 word::const_iterator unselected_end(int k=0) const; 00216 unsigned int num_kbest() const; 00218 std::wstring get_lemma(int k=0) const; 00220 std::wstring get_tag(int k=0) const; 00222 std::wstring get_short_tag(int k=0) const; 00224 std::wstring get_short_tag(const std::wstring &,int k=0) const; 00225 00227 std::list<std::pair<std::wstring,double> > get_senses(int k=0) const; 00228 // useful for java API 00229 std::wstring get_senses_string(int k=0) const; 00231 void set_senses(const std::list<std::pair<std::wstring,double> > &, int k=0); 00232 00234 unsigned long get_span_start() const; 00235 unsigned long get_span_finish() const; 00236 00238 bool found_in_dict() const; 00240 void set_found_in_dict(bool); 00242 bool has_retokenizable() const; 00244 void lock_analysis(); 00246 bool is_locked() const; 00247 00249 void add_alternative(const word &, double); 00251 void set_alternatives(const std::list<std::pair<word,double> > &); 00253 bool has_alternatives() const; 00255 std::list<std::pair<word,double> > get_alternatives() const; 00257 std::list<std::pair<word,double> >::iterator alternatives_begin(); 00259 std::list<std::pair<word,double> >::iterator alternatives_end(); 00260 00262 void add_analysis(const analysis &); 00264 void set_analysis(const analysis &); 00266 void set_analysis(const std::list<analysis> &); 00268 void set_form(const std::wstring &); 00270 void set_ph_form(const std::wstring &); 00272 void set_span(unsigned long, unsigned long); 00273 00275 bool find_tag_match(boost::u32regex &); 00276 00278 int get_n_analysis() const; 00280 void unselect_all_analysis(int k=0); 00282 void select_all_analysis(int k=0); 00284 void select_analysis(word::iterator, int k=0); 00286 void unselect_analysis(word::iterator, int k=0); 00288 std::list<analysis> get_analysis() const; 00290 word::iterator 
analysis_begin(); 00291 word::const_iterator analysis_begin() const; 00293 word::iterator analysis_end(); 00294 word::const_iterator analysis_end() const; 00295 00297 class WINDLL iterator : public std::list<analysis>::iterator { 00298 friend class word::const_iterator; 00299 private: 00301 std::list<analysis>::iterator ibeg; 00303 std::list<analysis>::iterator iend; 00305 int type; 00307 int kbest; 00308 00309 public: 00311 iterator(); 00313 iterator(const word::iterator &); 00315 iterator(const std::list<analysis>::iterator &); 00317 iterator(const std::list<analysis>::iterator &, 00318 const std::list<analysis>::iterator &, 00319 const std::list<analysis>::iterator &, int,int k=0); 00321 iterator& operator++(); 00322 iterator operator++(int); 00323 }; 00324 00326 class WINDLL const_iterator : public std::list<analysis>::const_iterator { 00327 private: 00329 std::list<analysis>::const_iterator ibeg; 00331 std::list<analysis>::const_iterator iend; 00333 int type; 00335 int kbest; 00336 00337 public: 00339 const_iterator(); 00341 const_iterator(const word::const_iterator &); 00343 const_iterator(const word::iterator &); 00345 const_iterator(const std::list<analysis>::const_iterator &); 00347 const_iterator(const std::list<analysis>::iterator &); 00349 const_iterator(const std::list<analysis>::const_iterator &, 00350 const std::list<analysis>::const_iterator &, 00351 const std::list<analysis>::const_iterator &, int, int k=0); 00353 const_iterator& operator++(); 00354 const_iterator operator++(int); 00355 }; 00356 00357 }; 00358 00359 00360 00366 00367 class WINDLL node { 00368 protected: 00370 std::wstring nodeid; 00372 bool head; 00374 int chunk; 00376 std::wstring label; 00378 word * w; 00379 00380 public: 00382 std::vector<std::wstring> user; 00383 00385 node(); 00386 node(const std::wstring &); 00388 std::wstring get_node_id() const; 00390 void set_node_id(const std::wstring &); 00392 std::wstring get_label() const; 00394 word get_word() const; 00396 word& 
get_word(); 00398 void set_label(const std::wstring &); 00400 void set_word(word &); 00402 bool is_head() const; 00404 void set_head(const bool); 00406 bool is_chunk() const; 00408 void set_chunk(const int); 00410 int get_chunk_ord() const; 00411 00412 }; 00413 00417 00418 class WINDLL parse_tree : public tree<node> { 00419 private: 00420 std::map<std::wstring,parse_tree::iterator> node_index; 00421 00422 public: 00423 parse_tree(); 00424 parse_tree(parse_tree::iterator p); 00425 parse_tree(const node &); 00426 00428 void build_node_index(); 00430 void rebuild_node_index(); 00432 parse_tree::iterator get_node_by_id(std::wstring) const; 00433 00434 static int nsentence; 00435 }; 00436 00437 00442 00443 class WINDLL depnode : public node { 00444 00445 private: 00447 parse_tree::iterator link; 00448 00449 public: 00450 depnode(); 00451 depnode(const std::wstring &); 00452 depnode(const node &); 00453 void set_link(const parse_tree::iterator); 00454 parse_tree::iterator get_link(); 00455 parse_tree::const_iterator get_link() const; 00457 tree<node>& get_link_ref(); 00458 }; 00459 00460 00461 00465 00466 class WINDLL dep_tree : public tree<depnode> { 00467 public: 00468 dep_tree(); 00469 dep_tree(const depnode &); 00470 }; 00471 00472 00478 00479 class WINDLL sentence : public std::list<word> { 00480 private: 00481 // vector with pointers to sentence words, for fast access by position 00482 std::vector<word*> wpos; 00483 // parse tree (if sentence parsed) 00484 std::map<int,parse_tree> pts; 00485 // dependencey tree (if sentence dep. 
parsed) 00486 std::map<int,dep_tree> dts; 00488 void clone(const sentence &); 00489 00490 public: 00491 sentence(); 00492 sentence(const std::list<word>&); 00494 sentence(const sentence &); 00496 sentence& operator=(const sentence&); 00498 const word& operator[](size_t) const; 00499 word& operator[](size_t); 00501 unsigned int num_kbest() const; 00503 void push_back(const word &); 00505 void rebuild_word_index(); 00506 00507 void clear(); 00508 00509 void set_parse_tree(const parse_tree &, int k=0); 00510 parse_tree & get_parse_tree(int k=0); 00511 const parse_tree & get_parse_tree(int k=0) const; 00512 bool is_parsed() const; 00513 00514 void set_dep_tree(const dep_tree &, int k=0); 00515 dep_tree & get_dep_tree(int k=0); 00516 const dep_tree & get_dep_tree(int k=0) const; 00517 bool is_dep_parsed() const; 00518 00520 std::vector<word> get_words() const; 00522 sentence::iterator words_begin(); 00523 sentence::const_iterator words_begin() const; 00524 sentence::iterator words_end(); 00525 sentence::const_iterator words_end() const; 00526 }; 00527 00532 00533 class WINDLL paragraph : public std::list<sentence> { 00534 public: 00535 paragraph() {} 00536 virtual ~paragraph() {} 00537 }; 00538 00543 00544 class WINDLL document : public std::list<paragraph> { 00545 00546 private: 00547 paragraph title; 00548 std::multimap<int,std::wstring> group2node; 00549 std::map<std::wstring,int> node2group; 00550 int last_group; 00551 00552 public: 00553 document(); 00555 void add_positive(std::wstring node1, int group1); 00557 void add_positive(std::wstring,std::wstring); 00559 int get_coref_group(std::wstring) const; 00561 std::list<std::wstring> get_coref_nodes(int) const; 00563 bool is_coref(std::wstring,std::wstring) const; 00564 }; 00565 00566 00567 #endif 00568
// Listing footer: 1.7.6.1 (presumably the version of the documentation generator)