|
FreeLing
3.0
|
00001 00003 // 00004 // FreeLing - Open Source Language Analyzers 00005 // 00006 // Copyright (C) 2004 TALP Research Center 00007 // Universitat Politecnica de Catalunya 00008 // 00009 // This library is free software; you can redistribute it and/or 00010 // modify it under the terms of the GNU General Public 00011 // License as published by the Free Software Foundation; either 00012 // version 3 of the License, or (at your option) any later version. 00013 // 00014 // This library is distributed in the hope that it will be useful, 00015 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00016 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00017 // General Public License for more details. 00018 // 00019 // You should have received a copy of the GNU General Public 00020 // License along with this library; if not, write to the Free Software 00021 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00022 // 00023 // contact: Lluis Padro (padro@lsi.upc.es) 00024 // TALP Research Center 00025 // despatx C6.212 - Campus Nord UPC 00026 // 08034 Barcelona. SPAIN 00027 // 00029 00030 #ifndef _DATES_MOD 00031 #define _DATES_MOD 00032 00033 #include <map> 00034 #include <boost/regex/icu.hpp> 00035 00036 #include "freeling/morfo/language.h" 00037 #include "freeling/morfo/automat.h" 00038 00039 00040 // Date/time regular expressions definitions 00041 00042 const std::wstring RE_ROMAN=L"^([IVXLCDM]+)$"; 00043 00044 // Default: 00045 const std::wstring RE_DATE_DF=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d)))/)(\\d{1,4}))$"; 00046 const std::wstring RE_TIME1_DF=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:min|m)?)?)$"; 00047 const std::wstring RE_TIME2_DF=L"^(?:((?:[0-5])?(?:\\d))(?:min\\.?|m\\.?))$"; 00048 00049 // Spanish: 00050 const std::wstring RE_DATE_ES=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|ene|feb|mar|abr|may|jun|jul|ago|sep|oct|nov|dic)/)(\\d{1,4}))$"; 00051 const std::wstring RE_TIME1_ES=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$"; 00052 const std::wstring RE_TIME2_ES=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$"; 00053 00054 // Catalan: 00055 const std::wstring RE_DATE_CA=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|gener|febrer|març|abril|maig|juny|juliol|agost|setembre|octubre|novembre|desembre|gen|feb|mar|abr|mai|jun|jul|ago|set|oct|nov|des)/)(\\d{1,4}))$"; 00056 const std::wstring RE_TIME1_CA=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minuts|min|m)?)?)$"; 00057 const std::wstring RE_TIME2_CA=L"^(?:((?:[0-5])?(?:\\d))(?:minuts|min\\.?|m\\.?))$"; 00058 00059 // English: 00060 const std::wstring RE_DATE_EN=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/)(\\d{1,4}))$"; 00061 const std::wstring RE_TIME1_EN=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutes|min|m)?)?)$"; 00062 const std::wstring RE_TIME2_EN=L"^(?:((?:[0-5])?(?:\\d))(?:minutes|min\\.?|m\\.?))$"; 00063 00064 // Galician: 00065 const std::wstring RE_DATE_GL=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|xaneiro|febreiro|marzo|abril|maio|xuño|xullo|agosto|setembro|outubro|novembro|decembro|xan|feb|mar|abr|mai|xuñ|xul|ago|set|out|nov|dec)/)(\\d{1,4}))$"; 00066 const std::wstring RE_TIME1_GL=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$"; 00067 const std::wstring RE_TIME2_GL=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$"; 00068 00069 // Portuguese: 00070 const std::wstring RE_DATE_PT=L"^(?:(?:((?:[0-3])?(?:\\d))/)(?:((?:(?:[0-1])?(?:\\d))|janeiro|fevereiro|março|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro|jan|fev|mar|abr|mai|jun|jul|ago|set|out|nov|dez)/)(\\d{1,4}))$"; 00071 const std::wstring RE_TIME1_PT=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:h|:)(?:((?:[0-5])?(?:\\d))(?:minutos|min|m)?)?)$"; 00072 const std::wstring RE_TIME2_PT=L"^(?:((?:[0-5])?(?:\\d))(?:minutos|min\\.?|m\\.?))$"; 00073 00074 // Russian: 00075 const std::wstring RE_DATE_RU=L"^([0]?[1-9]|[1|2][0-9]|[3][0|1])[./]([0]?[1-9]|[1][0-2])[./]([0-9]{4}|[0-9]{2})$"; 00076 const std::wstring RE_TIME_RU=L"^(?:((?:(?:[0-1])?(?:\\d))|(?:2(?:[0-4])))(?:ч\\.?|:)(?:((?:[0-5])?(?:\\d))(?:минуты?|мин\\.?)?)?)$"; 00077 const std::wstring RE_MINUTES_RU=L"^(?:((?:[0-5])?(?:\\d))(?:минуты?|мин\\.?))$"; 00078 00079 // Value of unspecified fields in normalized date 00080 const std::wstring UNKNOWN_SYMB = L"??"; 00081 00082 00087 00088 class dates_module: public automat { 00089 00090 protected: 00092 std::map<std::wstring,int> nMes; 00094 std::map<std::wstring,std::wstring> nDia; 00096 std::wstring century,year,month,day,weekday,hour,minute,meridian; 00098 int temp; 00099 int sign; // for Catalan "un quart menys(-1)/i(1) cinc de sis" or 00100 // for English: a quarter to(-1)/past(1) five. 00101 00102 int daytemp; // for special state Gbb in English 00103 bool inGbb; 00104 00106 std::map<std::wstring,int> tok; 00107 00108 // required regular expressions objects 00109 boost::u32regex RE_Date; 00110 boost::u32regex RE_Time1; 00111 boost::u32regex RE_Time2; 00112 boost::u32regex RE_Roman; 00113 00114 // remember results of last matched RegEx 00115 boost::wsmatch rem; 00116 00117 // to unify notation (01 -> 1), maybe adding an offset 00118 std::wstring normalize(const std::wstring &in, int offs=0); 00119 00120 private: 00121 virtual void ResetActions(); 00122 00123 public: 00125 dates_module(const std::wstring &, const std::wstring &, const std::wstring &, const std::wstring &); 00126 virtual ~dates_module() {} 00127 }; 00128 00129 00134 00135 class dates_default : public dates_module { 00136 00137 private: 00138 int ComputeToken(int, sentence::iterator &, sentence &); 00139 void StateActions(int, int, int, sentence::const_iterator); 00140 void SetMultiwordAnalysis(sentence::iterator, int); 00141 00142 public: 00144 dates_default(); 00145 }; 00146 00151 00152 class dates_es : public dates_module { 00153 00154 private: 00155 int ComputeToken(int, sentence::iterator &, sentence &); 00156 void StateActions(int, int, int, sentence::const_iterator); 00157 void SetMultiwordAnalysis(sentence::iterator, int); 00158 00159 public: 00161 dates_es(); 00162 }; 00163 00164 00169 00170 class dates_ca : public dates_module { 00171 00172 private: 00173 int ComputeToken(int, sentence::iterator &, sentence &); 00174 void StateActions(int, int, int, sentence::const_iterator); 00175 void SetMultiwordAnalysis(sentence::iterator, int); 00176 00177 public: 00179 dates_ca(); 00180 }; 00181 00186 00187 class dates_gl : public dates_module { 00188 00189 private: 00190 int ComputeToken(int, sentence::iterator &, sentence &); 00191 void StateActions(int, int, int, sentence::const_iterator); 00192 void SetMultiwordAnalysis(sentence::iterator, int); 00193 00194 public: 00196 dates_gl(); 00197 }; 00198 00203 00204 class dates_pt : public dates_module { 00205 00206 private: 00207 int ComputeToken(int, sentence::iterator &, sentence &); 00208 void StateActions(int, int, int, sentence::const_iterator); 00209 void SetMultiwordAnalysis(sentence::iterator, int); 00210 00211 public: 00213 dates_pt(); 00214 }; 00215 00216 00221 00222 class dates_en : public dates_module { 00223 00224 private: 00225 int ComputeToken(int, sentence::iterator &, sentence &); 00226 void StateActions(int, int, int, sentence::const_iterator); 00227 void SetMultiwordAnalysis(sentence::iterator, int); 00228 00230 std::map<std::wstring,int> numDay; 00231 00232 public: 00234 dates_en(); 00235 }; 00236 00241 00242 class dates_ru : public dates_module { 00243 00244 private: 00245 int ComputeToken(int, sentence::iterator &, sentence &); 00246 void StateActions(int, int, int, sentence::const_iterator); 00247 void SetMultiwordAnalysis(sentence::iterator, int); 00248 00249 int GetPrevStateValue(); 00250 void SetPrevStateValue(int); 00251 00252 public: 00254 dates_ru(); 00255 }; 00256 00257 00258 #endif 00259
1.7.6.1