FreeLing  3.0
dates_modules.h
Go to the documentation of this file.
00001 
00003 //
00004 //    FreeLing - Open Source Language Analyzers
00005 //
00006 //    Copyright (C) 2004   TALP Research Center
00007 //                         Universitat Politecnica de Catalunya
00008 //
00009 //    This library is free software; you can redistribute it and/or
00010 //    modify it under the terms of the GNU General Public
00011 //    License as published by the Free Software Foundation; either
00012 //    version 3 of the License, or (at your option) any later version.
00013 //
00014 //    This library is distributed in the hope that it will be useful,
00015 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00017 //    General Public License for more details.
00018 //
00019 //    You should have received a copy of the GNU General Public
00020 //    License along with this library; if not, write to the Free Software
00021 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00022 //
00023 //    contact: Lluis Padro (padro@lsi.upc.es)
00024 //             TALP Research Center
00025 //             despatx C6.212 - Campus Nord UPC
00026 //             08034 Barcelona.  SPAIN
00027 //
00029 
00030 #ifndef _DATES_MOD
00031 #define _DATES_MOD
00032 
00033 #include <map>
00034 #include <boost/regex/icu.hpp>
00035 
00036 #include "freeling/morfo/language.h"
00037 #include "freeling/morfo/automat.h"
00038 
00039 
00040 // Date/time regular expression definitions
00041 
// Matches a whole token consisting only of the uppercase letters I, V, X, L, C, D, M
// (one or more); the letters are captured as group 1. The pattern does not check
// that the sequence is a well-formed Roman numeral.
00042 const std::wstring RE_ROMAN=L"^([IVXLCDM]+)$";
00043 
00044 // Default:
// Numeric date "N/N/N" anchored to the whole token. Group 1: one or two digits
// (leading digit, if present, 0-3). Group 2: one or two digits (leading digit, if
// present, 0-1). Group 3: one to four digits. NOTE(review): presumably
// day/month/year; the pattern does no calendar validation (e.g. "39/19/0" matches).
00045 const std::wstring RE_DATE_DF=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d)))/)(\\d{1,4}))$";
// Clock time. Group 1: hour, 0-19 (optional leading digit) or 20-24, followed by
// "h" or ":". Group 2 (optional): minutes 0-59, optionally suffixed "min" or "m".
// Because the minutes part is optional, "12h" and "12:" both match.
00046 const std::wstring RE_TIME1_DF=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:min|m)?)?)$";
// Minutes-only expression. Group 1: 0-59; the unit "min" or "m" is mandatory and
// may be followed by a dot.
00047 const std::wstring RE_TIME2_DF=L"^(?:((?:[0-5])?(?:\\d))(?:min\\.?|m\\.?))$";
00048 
00049 // Spanish:
// Date "N/M/N" anchored to the whole token. Group 1: one or two digits (leading
// digit, if present, 0-3). Group 2: either one or two digits (leading digit 0-1) or
// a Spanish month name / three-letter abbreviation, listed in lowercase. Group 3:
// one to four digits. NOTE(review): whether matching is case-insensitive depends
// on how the pattern is compiled, which is not visible here.
00050 const std::wstring RE_DATE_ES=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|ene|feb|mar|abr|may|jun|jul|ago|sep|oct|nov|dic)/)(\\d{1,4}))$";
// Clock time. Group 1: hour, 0-19 (optional leading digit) or 20-24, followed by
// "h" or ":". Group 2 (optional): minutes 0-59, optionally suffixed "minutos",
// "min" or "m".
00051 const std::wstring RE_TIME1_ES=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$";
// Minutes-only expression. Group 1: 0-59, followed by a mandatory unit: "minutos",
// or "min"/"m" with an optional trailing dot.
00052 const std::wstring RE_TIME2_ES=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$";
00053 
00054 // Catalan:
// Date "N/M/N" anchored to the whole token. Group 1: one or two digits (leading
// digit, if present, 0-3). Group 2: either one or two digits (leading digit 0-1) or
// a Catalan month name / three-letter abbreviation, listed in lowercase. Group 3:
// one to four digits. NOTE(review): whether matching is case-insensitive depends
// on how the pattern is compiled, which is not visible here.
00055 const std::wstring RE_DATE_CA=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|gener|febrer|març|abril|maig|juny|juliol|agost|setembre|octubre|novembre|desembre|gen|feb|mar|abr|mai|jun|jul|ago|set|oct|nov|des)/)(\\d{1,4}))$";
// Clock time. Group 1: hour, 0-19 (optional leading digit) or 20-24, followed by
// "h" or ":". Group 2 (optional): minutes 0-59, optionally suffixed "minuts",
// "min" or "m".
00056 const std::wstring RE_TIME1_CA=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minuts|min|m)?)?)$";
// Minutes-only expression. Group 1: 0-59, followed by a mandatory unit: "minuts",
// or "min"/"m" with an optional trailing dot.
00057 const std::wstring RE_TIME2_CA=L"^(?:((?:[0-5])?(?:\\d))(?:minuts|min\\.?|m\\.?))$";
00058 
00059 // English:
// Date "N/M/N" anchored to the whole token. Group 1: one or two digits (leading
// digit, if present, 0-3). Group 2: either one or two digits (leading digit 0-1) or
// an English month name / three-letter abbreviation, listed in lowercase ("may"
// appears twice in the alternation, once as a name and once as an abbreviation;
// the duplicate is harmless). Group 3: one to four digits.
// NOTE(review): whether matching is case-insensitive depends on how the pattern is
// compiled, which is not visible here.
00060 const std::wstring RE_DATE_EN=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/)(\\d{1,4}))$";
// Clock time. Group 1: hour, 0-19 (optional leading digit) or 20-24, followed by
// "h" or ":". Group 2 (optional): minutes 0-59, optionally suffixed "minutes",
// "min" or "m".
00061 const std::wstring RE_TIME1_EN=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutes|min|m)?)?)$";
// Minutes-only expression. Group 1: 0-59, followed by a mandatory unit: "minutes",
// or "min"/"m" with an optional trailing dot.
00062 const std::wstring RE_TIME2_EN=L"^(?:((?:[0-5])?(?:\\d))(?:minutes|min\\.?|m\\.?))$";
00063 
00064 // Galician:
// Date "N/M/N" anchored to the whole token. Group 1: one or two digits (leading
// digit, if present, 0-3). Group 2: either one or two digits (leading digit 0-1) or
// a Galician month name / three-letter abbreviation, listed in lowercase. Group 3:
// one to four digits. NOTE(review): whether matching is case-insensitive depends
// on how the pattern is compiled, which is not visible here.
00065 const std::wstring RE_DATE_GL=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|xaneiro|febreiro|marzo|abril|maio|xuño|xullo|agosto|setembro|outubro|novembro|decembro|xan|feb|mar|abr|mai|xuñ|xul|ago|set|out|nov|dec)/)(\\d{1,4}))$";
// Clock time. Group 1: hour, 0-19 (optional leading digit) or 20-24, followed by
// "h" or ":". Group 2 (optional): minutes 0-59, optionally suffixed "minutos",
// "min" or "m".
00066 const std::wstring RE_TIME1_GL=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$";
// Minutes-only expression. Group 1: 0-59, followed by a mandatory unit: "minutos",
// or "min"/"m" with an optional trailing dot.
00067 const std::wstring RE_TIME2_GL=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$";
00068 
00069 // Portuguese:
// Date "N/M/N" anchored to the whole token. Group 1: one or two digits (leading
// digit, if present, 0-3). Group 2: either one or two digits (leading digit 0-1) or
// a Portuguese month name / three-letter abbreviation, listed in lowercase.
// Group 3: one to four digits. NOTE(review): whether matching is case-insensitive
// depends on how the pattern is compiled, which is not visible here.
00070 const std::wstring RE_DATE_PT=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro|jan|fev|mar|abr|mai|jun|jul|ago|set|out|nov|dez)/)(\\d{1,4}))$";
// Clock time. Group 1: hour, 0-19 (optional leading digit) or 20-24, followed by
// "h" or ":". Group 2 (optional): minutes 0-59, optionally suffixed "minutos",
// "min" or "m".
00071 const std::wstring RE_TIME1_PT=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$";
// Minutes-only expression. Group 1: 0-59, followed by a mandatory unit: "minutos",
// or "min"/"m" with an optional trailing dot.
00072 const std::wstring RE_TIME2_PT=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$";
00073 
00074 // Russian:
// Russian numeric date, anchored to the whole token; fields are separated by
// "." or "/" (each separator is matched independently).
//   group 1: day   1-31 (leading zero optional for 1-9)
//   group 2: month 1-12 (leading zero optional for 1-9)
//   group 3: year, four or two digits
// The character classes list their members directly. Writing "[1|2]" or "[0|1]"
// would also admit a literal '|' (so "|5.01.2012" or "3|.01.2012" would match),
// because '|' is not an alternation operator inside brackets.
const std::wstring RE_DATE_RU=L"^(0?[1-9]|[12][0-9]|3[01])[./](0?[1-9]|1[0-2])[./]([0-9]{4}|[0-9]{2})$";
// Russian clock time, anchored to the whole token. Group 1: hour, 0-19 (optional
// leading digit) or 20-24, followed by "ч" (optionally with a dot) or ":".
// Group 2 (optional): minutes 0-59, optionally suffixed "минут"/"минуты" or
// "мин" (the latter with an optional dot).
00076 const std::wstring RE_TIME_RU=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:ч\\.?|:)(?:((?:[0-5])?(?:\\d))(?:минуты?|мин\\.?)?)?)$";
// Minutes-only expression. Group 1: 0-59, followed by a mandatory unit:
// "минут"/"минуты", or "мин" with an optional trailing dot.
00077 const std::wstring RE_MINUTES_RU=L"^(?:((?:[0-5])?(?:\\d))(?:минуты?|мин\\.?))$";
00078 
00079 // Placeholder string ("??") used as the value of any unspecified field in a normalized date
00080 const std::wstring UNKNOWN_SYMB = L"??";
00081 
00082 
00087 
// Shared base of the per-language date/time recognizers declared in this header.
// Derives from `automat` and holds the lookup tables, working fields and regular
// expression objects available to the subclasses. Member functions are only
// declared here; their definitions are not part of this header.
00088 class dates_module: public automat {
00089 
00090  protected:
        // NOTE(review): presumably month name -> month number ("mes") -- confirm in the implementation.
00092   std::map<std::wstring,int> nMes;
        // NOTE(review): presumably weekday name -> normalized weekday string ("dia") -- confirm.
00094   std::map<std::wstring,std::wstring> nDia;
        // One string per date/time component; presumably filled in while a date is being recognized.
00096   std::wstring century,year,month,day,weekday,hour,minute,meridian;
        // NOTE(review): scratch integer; its exact use is not visible in this header.
00098   int temp;
00099   int sign;  // for Catalan "un quart menys(-1)/i(1) cinc de sis" or 
00100   // for English: a quarter to(-1)/past(1) five.
00101 
00102   int daytemp; // for special state Gbb in English
00103   bool inGbb;  // NOTE(review): presumably true while the automaton is in state Gbb -- confirm.
00104 
        // NOTE(review): presumably token text -> token code used by the automaton -- confirm.
00106   std::map<std::wstring,int> tok;
00107 
00108   // regular expression objects (Boost.Regex ICU/Unicode type) for dates, the two time forms, and Roman numerals
00109   boost::u32regex RE_Date;
00110   boost::u32regex RE_Time1;
00111   boost::u32regex RE_Time2;
00112   boost::u32regex RE_Roman;
00113 
00114   // remember results of last matched RegEx
00115   boost::wsmatch rem; 
00116 
00117   // to unify notation (01 -> 1), maybe adding an offset
        // `offs` defaults to 0.
00118   std::wstring normalize(const std::wstring &in, int offs=0);
00119 
00120  private:
        // Virtual hook. NOTE(review): presumably overrides a method declared in `automat` (not visible here).
00121   virtual void ResetActions();
00122 
00123  public:
        // Takes four wide strings. NOTE(review): presumably the date, time1, time2 and
        // Roman-numeral patterns for the four regex members above -- confirm the order
        // against the constructor definition.
00125   dates_module(const std::wstring &, const std::wstring &, const std::wstring &, const std::wstring &); 
        // Virtual so that subclasses can be destroyed through a dates_module pointer.
00126   virtual ~dates_module() {}
00127 };
00128 
00129 
00134 
// Date/time recognizer for the default case (no specific language); derives from
// dates_module. NOTE(review): presumably configured with the RE_*_DF patterns by
// its constructor, and the three private methods presumably implement hooks
// declared in `automat`; neither is visible in this header. No `override` keyword
// is used, so the compiler does not check that they match a base declaration.
00135 class dates_default : public dates_module {
00136 
00137  private:
00138   int ComputeToken(int, sentence::iterator &, sentence &);
00139   void StateActions(int, int, int, sentence::const_iterator);
00140   void SetMultiwordAnalysis(sentence::iterator, int);
00141 
00142  public:
        // Default constructor; defined outside this header.
00144   dates_default();
00145 };
00146 
00151 
// Date/time recognizer for Spanish; derives from dates_module.
// NOTE(review): presumably configured with the RE_*_ES patterns by its
// constructor, and the three private methods presumably implement hooks declared
// in `automat`; neither is visible in this header. No `override` keyword is used,
// so the compiler does not check that they match a base declaration.
00152 class dates_es : public dates_module {
00153 
00154  private:
00155   int ComputeToken(int, sentence::iterator &, sentence &);
00156   void StateActions(int, int, int, sentence::const_iterator);
00157   void SetMultiwordAnalysis(sentence::iterator, int);
00158 
00159  public:
        // Default constructor; defined outside this header.
00161   dates_es();
00162 };
00163 
00164 
00169 
// Date/time recognizer for Catalan; derives from dates_module.
// NOTE(review): presumably configured with the RE_*_CA patterns by its
// constructor, and the three private methods presumably implement hooks declared
// in `automat`; neither is visible in this header. No `override` keyword is used,
// so the compiler does not check that they match a base declaration.
00170 class dates_ca : public dates_module {
00171 
00172  private:
00173   int ComputeToken(int, sentence::iterator &, sentence &);
00174   void StateActions(int, int, int, sentence::const_iterator);
00175   void SetMultiwordAnalysis(sentence::iterator, int);
00176 
00177  public:
        // Default constructor; defined outside this header.
00179   dates_ca();
00180 };
00181 
00186 
// Date/time recognizer for Galician; derives from dates_module.
// NOTE(review): presumably configured with the RE_*_GL patterns by its
// constructor, and the three private methods presumably implement hooks declared
// in `automat`; neither is visible in this header. No `override` keyword is used,
// so the compiler does not check that they match a base declaration.
00187 class dates_gl : public dates_module {
00188 
00189  private:
00190   int ComputeToken(int, sentence::iterator &, sentence &);
00191   void StateActions(int, int, int, sentence::const_iterator);
00192   void SetMultiwordAnalysis(sentence::iterator, int);
00193 
00194  public:
        // Default constructor; defined outside this header.
00196   dates_gl();
00197 };
00198 
00203 
// Date/time recognizer for Portuguese; derives from dates_module.
// NOTE(review): presumably configured with the RE_*_PT patterns by its
// constructor, and the three private methods presumably implement hooks declared
// in `automat`; neither is visible in this header. No `override` keyword is used,
// so the compiler does not check that they match a base declaration.
00204 class dates_pt : public dates_module {
00205 
00206  private:
00207   int ComputeToken(int, sentence::iterator &, sentence &);
00208   void StateActions(int, int, int, sentence::const_iterator);
00209   void SetMultiwordAnalysis(sentence::iterator, int);
00210 
00211  public:
        // Default constructor; defined outside this header.
00213   dates_pt();
00214 };
00215 
00216 
00221 
// Date/time recognizer for English; derives from dates_module and adds one extra
// lookup table (numDay). NOTE(review): presumably configured with the RE_*_EN
// patterns by its constructor, and the three private methods presumably implement
// hooks declared in `automat`; neither is visible in this header. No `override`
// keyword is used, so the compiler does not check that they match a base declaration.
00222 class dates_en : public dates_module {
00223 
00224  private:
00225   int ComputeToken(int, sentence::iterator &, sentence &);
00226   void StateActions(int, int, int, sentence::const_iterator);
00227   void SetMultiwordAnalysis(sentence::iterator, int);
00228 
        // NOTE(review): presumably maps a textual day expression to its day number -- confirm in the implementation.
00230   std::map<std::wstring,int> numDay;
00231 
00232  public:
        // Default constructor; defined outside this header.
00234   dates_en();
00235 };
00236 
00241 
// Date/time recognizer for Russian; derives from dates_module and adds a private
// getter/setter pair for a "previous state value".
// NOTE(review): presumably configured with RE_DATE_RU / RE_TIME_RU /
// RE_MINUTES_RU by its constructor, and the three private methods presumably
// implement hooks declared in `automat`; neither is visible in this header. No
// `override` keyword is used, so the compiler does not check that they match a
// base declaration.
00242 class dates_ru : public dates_module {
00243 
00244  private:
00245   int ComputeToken(int, sentence::iterator &, sentence &);
00246   void StateActions(int, int, int, sentence::const_iterator);
00247   void SetMultiwordAnalysis(sentence::iterator, int);
00248 
        // Accessors for an integer "previous state value". NOTE(review): where that
        // value is stored is not visible in this header -- confirm in the implementation.
00249   int GetPrevStateValue();
00250   void SetPrevStateValue(int);
00251 
00252  public:
        // Default constructor; defined outside this header.
00254   dates_ru();
00255 };
00256 
00257 
00258 #endif
00259