FreeLing  3.0
language.h
Go to the documentation of this file.
00001 
00002 //
00003 //    FreeLing - Open Source Language Analyzers
00004 //
00005 //    Copyright (C) 2004   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This library is free software; you can redistribute it and/or
00009 //    modify it under the terms of the GNU General Public
00010 //    License as published by the Free Software Foundation; either
00011 //    version 3 of the License, or (at your option) any later version.
00012 //
00013 //    This library is distributed in the hope that it will be useful,
00014 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 //    General Public License for more details.
00017 //
00018 //    You should have received a copy of the GNU General Public
00019 //    License along with this library; if not, write to the Free Software
00020 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00021 //
00022 //    contact: Lluis Padro (padro@lsi.upc.es)
00023 //             TALP Research Center
00024 //             despatx C6.212 - Campus Nord UPC
00025 //             08034 Barcelona.  SPAIN
00026 //
00028 
00029 #ifndef _LANGUAGE
00030 #define _LANGUAGE
00031 
00032 #include <string>
00033 #include <list>
00034 #include <vector>
00035 #include <set>
00036 #include <map>
00037 #include <boost/regex/icu.hpp>
00038 
00039 #include "freeling/windll.h"
00040 #include "freeling/tree.h"
00041 
00042 class word; // forward declaration: class analysis (below) uses std::list<word> before word is defined
00043 
00048 
00049 class WINDLL analysis {  // One analysis record: lemma, tag, probability, distance, sense list and a retokenization word list.
00050   // NOTE(review): only declarations are visible in this header; remarks marked "presumably" are inferred from names -- confirm against the implementation.
00051  private:
00053   std::wstring lemma;  // lemma text
00055   std::wstring tag;  // tag text
00057   double prob;  // probability value; presumably has_prob() tells whether one was assigned
00059   double distance;  // distance value; presumably has_distance() tells whether one was assigned
00061   std::list<std::pair<std::wstring,double> > senses;  // (sense string, double) pairs; the meaning of the double is not shown here
00063   std::list<word> retok;  // word list, presumably the one handled by set_retokenizable()/get_retokenizable()
00064 
00065   // cache for short versions of Eagles tags;
00066   static std::map<std::wstring,std::wstring> stag_cache;
00067 
00068   std::wstring shorten_tag(const std::wstring &s=L"V") const;  // private helper, argument defaults to L"V"; presumably backs get_short_tag()
00069   
00070   // store which sequences --among the kbest proposed by 
00071   // the tagger-- contain this analysis
00072   std::set<int> selected_kbest;
00073 
00074  public:
00076   std::vector<std::wstring> user;  // public vector of strings, presumably free for client code to use
00077 
00079   analysis();  // default constructor
00081   analysis(const std::wstring &, const std::wstring &);  // builds from two strings; presumably lemma then tag -- TODO confirm order
00083   analysis& operator=(const analysis&);  // copy assignment
00084   // setters; presumably each stores into the private member of the same name
00085   void set_lemma(const std::wstring &);
00086   void set_tag(const std::wstring &);
00087   void set_prob(double);
00088   void set_distance(double);
00089   void set_retokenizable(const std::list<word> &);
00090   // read accessors: all are const and return by value
00091   bool has_prob() const;
00092   bool has_distance() const;
00093   std::wstring get_lemma() const;
00094   std::wstring get_tag() const;
00095   std::wstring get_short_tag() const;
00096   std::wstring get_short_tag(const std::wstring &) const;  // overload taking one string argument; its meaning is not shown here
00097   double get_prob() const;
00098   double get_distance() const;
00099   bool is_retokenizable() const;
00100   std::list<word> get_retokenizable() const;
00101 
00102   std::list<std::pair<std::wstring,double> > get_senses() const;
00103   void set_senses(const std::list<std::pair<std::wstring,double> > &);
00104   // useful for java API
00105   std::wstring get_senses_string() const;
00106 
00107   // get the largest kbest sequence index the analysis is selected in.
00108   int max_kbest() const;
00109   // find out whether the analysis is selected in the tagger k-th best sequence
00110   bool is_selected(int k=0) const;
00111   // mark this analysis as selected in k-th best sequence
00112   void mark_selected(int k=0);
00113   // unmark this analysis as selected in k-th best sequence
00114   void unmark_selected(int k=0);
00115 
00117   bool operator<(const analysis &) const;  // ordering comparison; the criterion is defined in the implementation
00119   bool operator==(const analysis &) const;  // equality comparison; the criterion is defined in the implementation
00120 };
00121 
00122 
00127 
00128 class WINDLL word : public std::list<analysis> {  // A word: a std::list<analysis> plus form strings, span, multiword parts and alternatives.
00129  private:  // NOTE(review): only declarations are visible here; remarks marked "presumably" are inferred from names -- confirm against the implementation.
00131   std::wstring form;  // word form; see get_form()/set_form()
00133   std::wstring lc_form;  // presumably the lowercased form -- TODO confirm
00135   std::wstring ph_form;  // presumably the phonetic form -- TODO confirm
00137   std::list<word> multiword;  // list of words, presumably the components returned by get_words_mw()
00139   bool ambiguous_mw;  // flag, presumably the one behind is_ambiguous_mw()/set_ambiguous_mw()
00141   std::list<std::pair<word,double> > alternatives;  // (word, double) pairs; the meaning of the double is not shown here
00143   unsigned long start, finish;  // two unsigned long bounds, presumably the span set by set_span()
00145   bool in_dict;  // flag, presumably the one behind found_in_dict()/set_found_in_dict()
00147   bool locked;  // flag, presumably the one behind lock_analysis()/is_locked()
00149   void clone(const word &);  // private copy helper, presumably shared by the copy constructor and operator=
00150   // integer codes; presumably the values kept in iterator::type / const_iterator::type -- TODO confirm
00152   static const int SELECTED=0;
00153   static const int UNSELECTED=1;
00154   static const int ALL=2;
00155 
00156  public:
00157   // predeclarations
00158   class iterator; 
00159   class const_iterator; 
00160 
00162   std::vector<std::wstring> user;  // public vector of strings, presumably free for client code to use
00163 
00165   word();  // default constructor
00167   word(const std::wstring &);  // from one string, presumably the form
00169   word(const std::wstring &, const std::list<word> &);  // from a string and a list of words (presumably form and multiword components)
00171   word(const std::wstring &, const std::list<analysis> &, const std::list<word> &);  // as above, with a list of analyses in between
00173   word(const word &);  // copy constructor
00175   word& operator=(const word&);  // copy assignment
00176   // In the members below, the int k (default 0) presumably selects the tagger's k-th best sequence, as in analysis::is_selected(int k).
00178   void copy_analysis(const word &);
00180   int get_n_selected(int k=0) const;
00182   int get_n_unselected(int k=0) const;
00184   bool is_multiword() const;
00186   bool is_ambiguous_mw() const;
00188   void set_ambiguous_mw(bool);
00190   int get_n_words_mw() const;
00192   std::list<word> get_words_mw() const;  // returns a list of words by value
00194   std::wstring get_form() const;
00196   std::wstring get_lc_form() const;
00198   std::wstring get_ph_form() const;
00200   word::iterator selected_begin(int k=0);  // const and non-const begin/end pairs over selected and unselected analyses (presumably filtered by k)
00202   word::const_iterator selected_begin(int k=0) const;
00204   word::iterator selected_end(int k=0);
00206   word::const_iterator selected_end(int k=0) const;
00208   word::iterator unselected_begin(int k=0);
00210   word::const_iterator unselected_begin(int k=0) const;
00212   word::iterator unselected_end(int k=0);
00214   word::const_iterator unselected_end(int k=0) const;
00216   unsigned int num_kbest() const;
00218   std::wstring get_lemma(int k=0) const;  // word-level getters taking k; which analysis they read from is decided in the implementation
00220   std::wstring get_tag(int k=0) const;
00222   std::wstring get_short_tag(int k=0) const;
00224   std::wstring get_short_tag(const std::wstring &,int k=0) const;
00225 
00227   std::list<std::pair<std::wstring,double> > get_senses(int k=0) const;
00228   // useful for java API
00229   std::wstring get_senses_string(int k=0) const;
00231   void set_senses(const std::list<std::pair<std::wstring,double> > &, int k=0);
00232 
00234   unsigned long get_span_start() const;
00235   unsigned long get_span_finish() const;
00236 
00238   bool found_in_dict() const;
00240   void set_found_in_dict(bool);
00242   bool has_retokenizable() const;
00244   void lock_analysis();
00246   bool is_locked() const;
00247 
00249   void add_alternative(const word &, double);
00251   void set_alternatives(const std::list<std::pair<word,double> > &);
00253   bool has_alternatives() const;
00255   std::list<std::pair<word,double> > get_alternatives() const;  // returns the list by value
00257   std::list<std::pair<word,double> >::iterator alternatives_begin();  // non-const iterators only; no const overloads are declared
00259   std::list<std::pair<word,double> >::iterator alternatives_end();
00260 
00262   void add_analysis(const analysis &);
00264   void set_analysis(const analysis &);  // overload taking a single analysis
00266   void set_analysis(const std::list<analysis> &);  // overload taking a list of analyses
00268   void set_form(const std::wstring &);
00270   void set_ph_form(const std::wstring &);
00272   void set_span(unsigned long, unsigned long);  // two bounds, presumably start then finish
00273 
00275   bool find_tag_match(boost::u32regex &);  // takes a non-const boost::u32regex; presumably tests it against analysis tags -- TODO confirm
00276 
00278   int get_n_analysis() const;
00280   void unselect_all_analysis(int k=0);
00282   void select_all_analysis(int k=0);
00284   void select_analysis(word::iterator, int k=0);
00286   void unselect_analysis(word::iterator, int k=0);
00288   std::list<analysis> get_analysis() const;  // returns the analyses as a std::list by value
00290   word::iterator analysis_begin();
00291   word::const_iterator analysis_begin() const;
00293   word::iterator analysis_end();
00294   word::const_iterator analysis_end() const;
00295 
00297   class WINDLL iterator : public std::list<analysis>::iterator {  // std::list<analysis>::iterator extended with range bounds, a type code and a kbest index
00298     friend class word::const_iterator;  // lets const_iterator reach the private members below
00299     private:
00301       std::list<analysis>::iterator ibeg;  // presumably the start of the underlying range
00303       std::list<analysis>::iterator iend;  // presumably the end of the underlying range
00305       int type;  // presumably one of word::SELECTED / UNSELECTED / ALL -- TODO confirm
00307       int kbest;  // presumably the k-best sequence index used when filtering
00308 
00309     public:
00311       iterator();
00313       iterator(const word::iterator &);  // copy constructor
00315       iterator(const std::list<analysis>::iterator &);  // non-explicit, so a plain list iterator converts implicitly
00317       iterator(const std::list<analysis>::iterator &, 
00318                const std::list<analysis>::iterator &, 
00319                const std::list<analysis>::iterator &, int,int k=0);  
00321       iterator& operator++();  // pre-increment; hides the base-class operator++
00322       iterator operator++(int);  // post-increment
00323   };
00324   
00326   class WINDLL const_iterator : public std::list<analysis>::const_iterator {  // const counterpart of word::iterator
00327     private:
00329       std::list<analysis>::const_iterator ibeg;  // presumably the start of the underlying range
00331       std::list<analysis>::const_iterator iend;  // presumably the end of the underlying range
00333       int type;  // presumably one of word::SELECTED / UNSELECTED / ALL -- TODO confirm
00335       int kbest;  // presumably the k-best sequence index used when filtering
00336 
00337     public:
00339       const_iterator();
00341       const_iterator(const word::const_iterator &);  // copy constructor
00343       const_iterator(const word::iterator &);  // conversion from the non-const word iterator
00345       const_iterator(const std::list<analysis>::const_iterator &);
00347       const_iterator(const std::list<analysis>::iterator &);
00349       const_iterator(const std::list<analysis>::const_iterator &,
00350                      const std::list<analysis>::const_iterator &, 
00351                      const std::list<analysis>::const_iterator &, int, int k=0);
00353       const_iterator& operator++();  
00354       const_iterator operator++(int);  
00355   };
00356 
00357 };
00358 
00359 
00360 
00366 
00367 class WINDLL node {  // Tree node payload: id, label, head flag, chunk value and a pointer to a word. Used as tree<node> by parse_tree.
00368  protected:  // protected rather than private, so derived classes such as depnode can access these members
00370   std::wstring nodeid;  // node identifier; see get_node_id()/set_node_id()
00372   bool head;  // flag, presumably the one behind is_head()/set_head()
00374   int chunk;  // integer chunk value; presumably is_chunk() and get_chunk_ord() are derived from it
00376   std::wstring label;  // node label; see get_label()/set_label()
00378   word * w;  // raw pointer to a word; ownership and lifetime are not visible in this header
00379 
00380  public:
00382   std::vector<std::wstring> user;  // public vector of strings, presumably free for client code to use
00383 
00385   node();  // default constructor
00386   node(const std::wstring &);  // from one string (presumably the label -- TODO confirm); non-explicit, so a wstring converts implicitly
00388   std::wstring get_node_id() const;
00390   void set_node_id(const std::wstring &);
00392   std::wstring get_label() const;
00394   word get_word() const;  // const overload: returns a word by value
00396   word& get_word();  // non-const overload: returns a reference to a word
00398   void set_label(const std::wstring &);
00400   void set_word(word &);  // takes the word by non-const reference; presumably stores its address in w -- TODO confirm
00402   bool is_head() const;
00404   void set_head(const bool);
00406   bool is_chunk() const;
00408   void set_chunk(const int);
00410   int  get_chunk_ord() const;
00411 
00412 };
00413 
00417 
00418 class WINDLL parse_tree : public tree<node> {  // A tree<node> (tree comes from freeling/tree.h) with an added string-keyed index of positions.
00419  private:
00420   std::map<std::wstring,parse_tree::iterator> node_index;  // wide-string key -> tree iterator; presumably keyed by node id (cf. get_node_by_id)
00421 
00422  public:
00423   parse_tree();  // default constructor
00424   parse_tree(parse_tree::iterator p);  // from an iterator; non-explicit
00425   parse_tree(const node &);  // from a node; non-explicit
00426 
00428   void build_node_index();
00430   void rebuild_node_index();  // how this differs from build_node_index() is not shown in this header
00432   parse_tree::iterator get_node_by_id(std::wstring) const;  // takes the id by value; what is returned for an unknown id is not shown here
00433 
00434   static int nsentence;  // public static int; its purpose is not shown here (presumably a sentence counter -- TODO confirm)
00435 };
00436 
00437 
00442 
00443 class WINDLL depnode : public node {  // A node extended with a link (a parse_tree iterator). Used as tree<depnode> by dep_tree.
00444 
00445  private:
00447   parse_tree::iterator link;  // iterator into a parse_tree; presumably the parse-tree node this dependency node refers to
00448 
00449  public:
00450   depnode();  // default constructor
00451   depnode(const std::wstring &);  // from one string, mirroring node(const std::wstring &)
00452   depnode(const node &);  // from a plain node; non-explicit, so a node converts implicitly
00453   void set_link(const parse_tree::iterator);
00454   parse_tree::iterator get_link();  // non-const overload
00455   parse_tree::const_iterator get_link() const;  // const overload returns a const_iterator
00457   tree<node>& get_link_ref();  // returns a tree<node> reference; presumably the subtree that link points at -- TODO confirm
00458 };
00459 
00460 
00461 
00465 
00466 class WINDLL dep_tree :  public tree<depnode> {  // A tree<depnode>; adds only the two constructors below.
00467  public:
00468   dep_tree();  // default constructor
00469   dep_tree(const depnode &);  // from a depnode; non-explicit
00470 };
00471 
00472 
00478 
00479 class WINDLL sentence : public std::list<word> {  // A sentence: a std::list<word> plus a positional index and int-keyed parse / dependency trees.
00480  private:
00481   // vector with pointers to sentence words, for fast access by position
00482   std::vector<word*> wpos; 
00483   // parse trees (if sentence parsed), keyed by int -- presumably the k passed to set_parse_tree()
00484   std::map<int,parse_tree> pts;
00485   // dependency trees (if sentence dep. parsed), keyed by int -- presumably the k passed to set_dep_tree()
00486   std::map<int,dep_tree> dts;
00488   void clone(const sentence &);  // private copy helper, presumably shared by the copy constructor and operator=
00489 
00490  public:
00491   sentence();  // default constructor
00492   sentence(const std::list<word>&);  // from a list of words; non-explicit
00494   sentence(const sentence &);  // copy constructor
00496   sentence& operator=(const sentence&);  // copy assignment
00498   const word& operator[](size_t) const;  // positional access, presumably through wpos; bounds behaviour is not shown here
00499   word& operator[](size_t);
00501   unsigned int num_kbest() const;
00503   void push_back(const word &);  // hides std::list<word>::push_back (not virtual), so calls through a std::list<word>& bypass it
00505   void rebuild_word_index();  // presumably refreshes wpos -- TODO confirm
00506  
00507   void clear();  // hides std::list<word>::clear (not virtual)
00508   // In the tree accessors below, k defaults to 0.
00509   void set_parse_tree(const parse_tree &, int k=0);
00510   parse_tree & get_parse_tree(int k=0);
00511   const parse_tree & get_parse_tree(int k=0) const;
00512   bool is_parsed() const;
00513 
00514   void set_dep_tree(const dep_tree &, int k=0);
00515   dep_tree & get_dep_tree(int k=0);
00516   const dep_tree & get_dep_tree(int k=0) const;
00517   bool is_dep_parsed() const;
00518 
00520   std::vector<word> get_words() const;  // returns the words as a std::vector by value
00522   sentence::iterator words_begin();
00523   sentence::const_iterator words_begin() const;
00524   sentence::iterator words_end();
00525   sentence::const_iterator words_end() const;
00526 };
00527 
00532 
00533 class WINDLL paragraph : public std::list<sentence> {  // A paragraph: a std::list<sentence> with no added members.
00534  public:
00535   paragraph() {}  // empty default constructor
00536   virtual ~paragraph() {}  // empty virtual destructor; note that std::list's own destructor is not virtual
00537 };
00538 
00543 
00544 class WINDLL document : public std::list<paragraph> {  // A document: a std::list<paragraph> plus a title and group/node maps (coreference, judging by the method names).
00545 
00546  private:
00547   paragraph title;  // a separate paragraph member, not part of the inherited list
00548   std::multimap<int,std::wstring> group2node;  // int group -> wide-string node keys (several entries per group allowed)
00549   std::map<std::wstring,int> node2group;  // wide-string node key -> int group (one per node)
00550   int last_group;  // presumably the most recently used group number -- TODO confirm
00551 
00552  public:
00553   document();  // default constructor
00555   void add_positive(std::wstring node1, int group1);  // takes a node key and a group number; presumably adds the node to that group
00557   void add_positive(std::wstring,std::wstring);  // takes two node keys; presumably puts both in the same group -- TODO confirm
00559   int get_coref_group(std::wstring) const;  // what is returned for an unknown node is not shown here
00561   std::list<std::wstring> get_coref_nodes(int) const;  // returns a list of node keys by value
00563   bool is_coref(std::wstring,std::wstring) const;  // takes two node keys by value
00564 };
00565 
00566 
00567 #endif
00568