|
FreeLing
3.0
|
00001 /* 00002 * String aligner for the Phonetic Distance Package 00003 * + 00004 * + April 2006 pcomas@lsi.upc.edu 00005 * 00006 */ 00007 #ifndef _aligner_h 00008 #define _aligner_h 00009 00010 #include <iostream> 00011 #include <fstream> 00012 #include <sstream> 00013 #include <map> 00014 #include <set> 00015 #include <string> 00016 #include <math.h> 00017 00018 #include "freeling/morfo/phd.h" 00019 00020 #define GLOBAL 1 // The aligner produces local alignments 00021 #define SEMILOCAL 2 // The aligner produces semilocal alignments 00022 #define LOCAL 3 // The aligner produces global alignments 00023 00024 00025 template<typename T=int> 00026 class aligner{ 00027 00028 private: 00029 phd<T>* sc; 00030 T score; 00031 int debug; 00032 00033 public: 00034 00035 struct alin { 00036 T score; // score of this alignment 00037 double scoren; // score normalized 00038 double context; 00039 int Psubstitutions; 00040 int Pinserts; 00041 int Pdeletions; 00042 int Wsubstitutions; 00043 int Winserts; 00044 int Wdeletions; 00045 unsigned int kword; //The number of keyword involved in this alignment 00046 int begin; // index in b[] where the alignment starts 00047 int end; // index in b[] where the alignment ends 00048 int beginW; //number of word in b[] where the alignment starts (each ' ' changes the word) 00049 int endW; //number of word in b[] where the alignment ends 00050 wchar_t* seg; // Segment of b[] with the letters 00051 wchar_t* a; // Alignment of a 00052 wchar_t* b; // Alignment of b 00053 bool good; // Marks if this alignment is selected as relevant 00054 00055 ~alin(){ 00056 delete[](a); 00057 delete[](b); 00058 delete[](seg); 00059 } 00060 00061 alin(T _score, int _begin, int _end, int _beginW, int _endW, int _substitutions, int _inserts, int _deletions, wchar_t* _a, wchar_t* _b) : 00062 score(_score), Psubstitutions(_substitutions), Pinserts(_inserts), Pdeletions(_deletions), begin(_begin), end(_end), beginW(_beginW), endW(_endW), seg(NULL), a(_a), b(_b) {} 00063 }; 00064 00065 00066 aligner(const std::wstring &fname, int const _debug = 0){ 00067 sc = new phd<T>(fname); 00068 debug=_debug; 00069 // if(debug>3){ sc->show(cerr);} 00070 } //constructor 00071 00072 ~aligner(){ 00073 delete(sc); 00074 } 00075 00076 00077 /* 00078 * Alinia la cadena A contra la cadena B. Conceptualment es considera que A és la query 00079 */ 00080 alin* align(const std::wstring &wa, const std::wstring &wb, const int mode = SEMILOCAL){ 00081 // a is the short string with length tj 00082 // b is the long string with length ti 00083 // The algorithm searchs for the best match of a against b in a semi-local or global point of view. 00084 // Usage of global matching is discouraged for the sake of coherence :-) because it doesn't try 00085 // to keep the letters together so 'a' will be meaningless scattered through all 'b' 00086 00087 const wchar_t *a=wa.c_str(); const int tj=wa.size(); 00088 const wchar_t *b=wb.c_str(); const int ti=wb.size(); 00089 00090 /* 00091 // Build the align matrix 00092 */ 00093 int const W = ti+1; 00094 00095 if( ti==0 || tj == 0 ){ return new alin(0,0,0,0,0,0,0,0,0,0); } 00096 int i,j; 00097 int* m = new int[W*(tj+1)+ti+1]; //int m[tj+1][ti+1]; 00098 00099 // Variables for alignment reconstruction 00100 wchar_t* answerA = new wchar_t[tj+ti]; 00101 wchar_t* answerB = new wchar_t[tj+ti]; 00102 int pA = 0; 00103 int pB = 0; 00104 int insertions=0, deletions=0, substitutions=0; 00105 int spacesA=0; 00106 00107 00108 // INITIALIZATION STEP 00109 00110 // Decide the number of word that each character belongs to 00111 int nwords=0; 00112 for(int i=0; i<ti; i++){ 00113 if( b[i] == L' ' || b[i] == L'_' ){ 00114 // word change at _i_ 00115 nwords++; 00116 } 00117 } 00118 00119 int* words = new int[nwords]; 00120 j=0; 00121 for(int i=0; i<ti; i++){ 00122 if( b[i] == L' ' || b[i] == L'_' ){ 00123 words[j++] = i; 00124 } 00125 } 00126 00127 switch(mode){ 00128 case GLOBAL: //Start values are skips 00129 m[0]=0; 00130 for(int j=1; j<=tj; j++){ m[j*W] = m[(j-1)*W] + sc->dSkip(a[j-1]); } 00131 for(int i=1; i<=ti; i++){ m[i] = m[i-1] + sc->dSkip(b[i-1]); } 00132 break; 00133 default: //Start values are 0 00134 for(int j=0;j<=tj;j++){ m[j*W] = 0; } 00135 for(int i=1;i<=ti;i++){ m[i] = 0; } 00136 break; 00137 } 00138 00139 00140 /* 00141 // Throw the scoring process 00142 */ 00143 int i1, i2, i3, i4, i5, indexInit, indexEnd, bestJ, bestI; 00144 indexEnd = 0; 00145 indexInit = 0; 00146 int initWord = 0; 00147 int endWord = 0; 00148 score = -100000000; 00149 bestJ = tj; 00150 bestI = ti; 00151 00152 // FILL THE FIRST COLUMN & ROW 00153 for(j=1;j<=tj;j++){ 00154 m[j*W+1] = std::max( m[W*(j-1)]+sc->dSub(a[j-1],b[0]) , m[W*(j-1)+1]+sc->dSkip(a[j-1]) ); 00155 if(a[j-1]==L' '|| a[j-1]==L'_' ){spacesA++;} 00156 if(score < m[W*j+1]){ 00157 score = m[W*j+1]; 00158 bestJ = j; 00159 bestI = 0; 00160 } 00161 } 00162 for(i=1;i<=ti;i++){ 00163 m[ W+i ] = std::max( m[ i-1 ]+sc->dSub(a[0],b[i-1]) , m[ W+i-1 ]+sc->dSkip(b[i-1]) ); 00164 if(score < m[W+i]){ 00165 score = m[W+i]; 00166 bestJ = 0; 00167 bestI = i; 00168 } 00169 } 00170 00171 //FILL THE REST OF THE MATRIX 00172 //int step = ti/4; 00173 int lowLimit = 0; 00174 int upperLimit = 0; 00175 int threshold = -10000000; 00176 if( mode == LOCAL ){ threshold = 0; } 00177 00178 for(j=2;j<=tj;j++){ 00179 //if( mode==GLOBAL && tj/ti> 0.75){ 00180 // // If the scoring is GLOBAL we do not fill the entire matrix 00181 // lowLimit = max( 2 , j-step ); 00182 // upperLimit = min( ti , j+step ); 00183 // m[W*j+lowLimit+1] = 0; 00184 // } else { 00185 lowLimit = 2; 00186 upperLimit = ti; 00187 // } 00188 00189 for(i=lowLimit;i<=upperLimit;i++){ 00190 i1 = m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]); 00191 i2 = m[W*j+i-1] + sc->dSkip(b[i-1]); 00192 i3 = m[W*(j-1)+i-2] + sc->dExp(a[j-1],b[i-2],b[i-1]); 00193 i4 = m[W*(j-1)+i] + sc->dSkip(a[j-1]); 00194 i5 = m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]); 00195 m[W*j+i] = std::max(threshold,std::max(i1,std::max(i1,std::max(i2,std::max(i3,std::max(i4,i5)))))); 00196 00197 if(score < m[W*j+i]){ 00198 score = m[W*j+i]; 00199 bestJ = j; 00200 bestI = i; 00201 } 00202 00203 } 00204 } 00205 00206 //Prints the full score matrix if needed 00207 /*if(debug>5){ 00208 cerr << endl << "\t"; 00209 for(i=0;i<ti;i++){ cerr << "\t" << b[i]; } 00210 cerr << endl << "\t"; 00211 for(j=0;j<=tj;j++){ 00212 if(j>0) cerr << a[j-1] << "\t"; 00213 for(i=0;i<=ti;i++){ 00214 cerr << m[W*j+i] << "\t"; 00215 } 00216 cerr << endl; 00217 } 00218 }*/ 00219 00220 00221 /* 00222 || Reconstruction Step 00223 */ 00224 bool final = true; //Exit condition 00225 int lastScore = score; 00226 int steps = 0; //Number of steps uset to align 'a' with 'b' 00227 int lastChar = 0; 00228 00229 00230 int ni=0; 00231 int mymax=0; 00232 00233 switch(mode){ 00234 case SEMILOCAL: 00235 /* In semi-local alignment, the best-path reconstructions doesn't start from the 00236 * botom-right corner of the matrix but from the best-scoring cell among last 00237 * column and last row scores. 00238 * In this implementation only the last row is taken in account since it is 00239 * the "long" string 00240 */ 00241 for(i=0;i<=ti;i++){ 00242 if( mymax < m[W*tj+i] ){ 00243 mymax = m[W*tj+i]; 00244 ni=i; 00245 } 00246 } 00247 score = mymax; 00248 for(i=ti-1;i>=ni;i--){ // Carry on half of the string for debugging purpouses 00249 answerA[pA++]= L'-'; 00250 answerB[pB++]= b[i]; 00251 insertions++; 00252 } 00253 i=ni; j=tj; 00254 break; 00255 case GLOBAL: // now this is global alignment 00256 i = ti; j = tj; 00257 break; 00258 case LOCAL: 00259 /* In LOCAL alignment, the best-path reconstruction starts with at the best scoring 00260 * position in the whole matrix. 00261 */ 00262 j = bestJ; 00263 i = bestI; 00264 break; 00265 } 00266 00267 00268 while( final ){ 00269 00270 /*if(debug>5){ 00271 cerr << "Looking for " << m[ W*j +i] << " in ("<<j<<","<<i<<")"<<endl; 00272 cerr << "m[j-1][i-1]=" << m[ W*(j-1) +i-1] << ": dSub=" << sc->dSub(a[j-1],b[i-1]) << endl; 00273 cerr << "m[j][i-1]=" << m[ W*j +i-1] << ": dSkipB=" << sc->dSkip(b[i-1]) << endl; 00274 cerr << "m[j-1][i]=" << m[ W*(j-1) +i] << ": dSkipA=" << sc->dSkip(a[j-1]) << endl; 00275 if(j>1){ cerr << "m[j-1][i-2]=" << m[ W*(j-1) +i-2] << ": dExp=" << sc->dExp(a[j-1],b[i-2],b[i-1]) << endl; } 00276 if(i>1){ cerr << "m[j-2][i-1]=" << m[ W*(j-2) +i-1] << ": dExp=" << sc->dExp(b[i-1],a[j-2],a[j-1]) << endl; } 00277 }*/ 00278 00279 00280 if( j>0 && i>0 && m[W*j+i] == m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]) ){ 00281 //Aliniar b[i] amb a[j] 00282 lastScore = sc->dSub(a[j-1],b[i-1]); 00283 i--; j--; 00284 //cerr << "Sub a[" << j << "]=" << a[j] << " b[" << i << "]=" << b[i] << " @ " << pA << endl; 00285 answerA[pA++] = a[j]; 00286 answerB[pB++] = b[i]; 00287 indexInit= i; 00288 indexEnd = std::max(i,indexEnd); 00289 if( a[j]!=b[i] ){ substitutions++; } 00290 steps++; 00291 lastChar = steps; 00292 00293 } else if ( i>0 && j>0 && m[W*j+i] == m[W*j+i-1] + sc->dSkip(b[i-1]) ) { 00294 lastScore = sc->dSkip(b[i-1]); 00295 i--; 00296 answerA[pA++] = L'-'; 00297 answerB[pB++] = b[i]; 00298 insertions++; 00299 if(steps==0 && b[i]==L' ' && b[i]==L'_' ){ 00300 score = m[W*j+i]; 00301 } else { 00302 steps++; 00303 } 00304 00305 } else if( j>1 && i>0 && m[W*j+i] == m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]) ){ 00306 lastScore = sc->dExp(b[i-1],a[j-2],a[j-1]); 00307 i--; j-=2; 00308 answerA[pA++] = a[j]; 00309 answerA[pA++] = a[j+1]; 00310 answerB[pB++] = b[i]; 00311 answerB[pB++] = L'+'; 00312 indexInit = i; 00313 indexEnd = std::max(i,indexEnd); 00314 deletions++; 00315 if( a[j]!=b[i] ){ substitutions++; } 00316 steps+=2; 00317 lastChar = steps; 00318 00319 } else if( j>0 && i>0 && m[W*j+i] == m[W*(j-1)+i] + sc->dSkip(a[j-1]) ){ 00320 lastScore = sc->dSkip(a[j-1]); 00321 j--; 00322 answerA[pA++] = a[j]; 00323 answerB[pB++] = L'-'; 00324 indexInit = i; 00325 indexEnd = std::max(i,indexEnd); 00326 deletions++; 00327 steps++; 00328 lastChar = steps; 00329 00330 } else if ( i>1 && j>0 && m[W*j+i] == m[ W*(j-1) +i-2 ] + sc->dExp(a[j-1],b[i-2],b[i-1]) ) { 00331 lastScore = sc->dExp(a[j-1],b[i-2],b[i-1]); 00332 j--; i--; 00333 answerA[pA++] = a[j]; 00334 answerA[pA++] = L'+'; 00335 answerB[pB++] = b[i]; 00336 answerB[pB++] = b[i-1]; 00337 indexInit = i; 00338 indexEnd = std::max(i,indexEnd); 00339 insertions++; 00340 if( a[j]!=b[i] ){ substitutions++; } 00341 i--; 00342 steps+=2; 00343 lastChar = steps; 00344 00345 } else if ( j==0 ){ 00346 i--; 00347 answerA[pA++] = L'-'; 00348 answerB[pB++] = b[i]; 00349 insertions++; 00350 if(steps!=0 || b[i]!=L' ' || b[i]!=L'_' ) steps++; 00351 00352 } else if ( i==0 ){ 00353 j--; 00354 answerA[pA++] = a[j]; 00355 answerB[pB++] = L'-'; 00356 deletions++; 00357 00358 if(steps!=0 || b[i]!=L' ' || b[i]!=L'_' ) steps++; 00359 lastChar = steps; 00360 00361 } else { 00362 std::wcerr << L"BOINK! Error at "<< j << L"," << i << std::endl; 00363 break; 00364 } 00365 00366 // This is the termination condition 00367 switch(mode){ 00368 case SEMILOCAL: 00369 final = j!=0; 00370 if(!final){ 00371 i--; 00372 while(i>=0){ 00373 answerA[pA++] = L'-'; 00374 answerB[pB++] = b[i--]; 00375 insertions++; 00376 } 00377 } 00378 break; 00379 00380 case GLOBAL: 00381 final = i!=0 || j!=0; 00382 break; 00383 00384 case LOCAL: 00385 final = m[W*j+i]!=0; 00386 break; 00387 } 00388 } 00389 00390 00391 // Computing of the score 00392 // Score= avg.points/phoneme 00393 steps = lastChar; 00394 00395 switch(mode){ 00396 case GLOBAL: 00397 // average of similarity per phonem (without spaces?) 00398 score /= (pA-spacesA); 00399 break; 00400 case SEMILOCAL: 00401 //if(debug>2) cerr << " Score = " << score << " Last Score = " << lastScore << " steps = " << steps << endl; 00402 //score = (score-lastScore) / steps; 00403 score = score / steps; 00404 break; 00405 case LOCAL: 00406 score = score / steps; 00407 break; 00408 } 00409 00410 // Print the alignment 00411 wchar_t* newA = new wchar_t[pA+1]; 00412 wchar_t* newB = new wchar_t[pB+1]; 00413 newA[pA]=0; 00414 newB[pB]=0; 00415 --pA; 00416 --pB; 00417 00418 for(int i=0; pA>=0; ++i, --pA){ 00419 newA[i] = answerA[pA]; 00420 } 00421 for(int i=0; pB>=0; ++i, --pB){ 00422 newB[i] = answerB[pB]; 00423 } 00424 00425 delete[](answerA); 00426 delete[](answerB); 00427 delete[] m; 00428 00429 // Finds in wich word does indexInit and indexEnd fall 00430 for(int i=0; i<nwords; i++){ 00431 if( words[i] == indexInit ){ 00432 initWord = i+1; 00433 break; 00434 } else if( words[i] < indexInit ) { 00435 initWord = i+1; 00436 } else if( words[i] > indexInit ) { 00437 initWord = i; 00438 break; 00439 } 00440 00441 } 00442 00443 for(int i=0; i<nwords; i++){ 00444 if( words[i] == indexEnd ){ 00445 endWord = i; 00446 break; 00447 } else if( words[i] < indexEnd ) { 00448 endWord = i+1; 00449 } else if( words[i] > indexEnd ) { 00450 endWord = i; 00451 break; 00452 } 00453 00454 } 00455 00456 delete[] words; 00457 00458 return new alin(score,indexInit,indexEnd,initWord,endWord,substitutions,insertions,deletions,newA,newB); 00459 00460 } 00461 00462 }; 00463 00464 00465 #endif
1.7.6.1