FreeLing  3.0
aligner.h
Go to the documentation of this file.
00001 /*
00002  * String aligner for the Phonetic Distance Package
00003  * + 
00004  * + April 2006 pcomas@lsi.upc.edu
00005  *
00006  */
00007 #ifndef _aligner_h
00008 #define _aligner_h
00009 
00010 #include <iostream>
00011 #include <fstream>
00012 #include <sstream>
00013 #include <map>
00014 #include <set>
00015 #include <string>
00016 #include <math.h>
00017 
00018 #include "freeling/morfo/phd.h"
00019 
00020 #define GLOBAL    1  // The aligner produces local alignments
00021 #define SEMILOCAL 2  // The aligner produces semilocal alignments
00022 #define LOCAL     3  // The aligner produces global alignments
00023 
00024 
00025 template<typename T=int> 
00026   class aligner{
00027 
00028  private:
00029  phd<T>* sc;
00030  T score;
00031  int debug;
00032 
00033  public:
00034 
00035  struct alin {
00036    T score;   // score of this alignment
00037    double scoren; // score normalized
00038    double context;
00039    int Psubstitutions;
00040    int Pinserts;
00041    int Pdeletions;
00042    int Wsubstitutions;
00043    int Winserts;
00044    int Wdeletions;
00045    unsigned int kword; //The number of keyword involved in this alignment
00046    int begin; // index in b[] where the alignment starts
00047    int end;   // index in b[] where the alignment ends
00048    int beginW; //number of word in b[] where the alignment starts (each ' ' changes the word)
00049    int endW;   //number of word in b[] where the alignment ends
00050    wchar_t* seg; // Segment of b[] with the letters
00051    wchar_t* a;   // Alignment of a
00052    wchar_t* b;   // Alignment of b
00053    bool good; // Marks if this alignment is selected as relevant
00054 
00055    ~alin(){
00056      delete[](a);
00057      delete[](b);
00058      delete[](seg);
00059    }
00060 
00061  alin(T _score, int _begin, int _end, int _beginW, int _endW, int _substitutions, int _inserts, int _deletions, wchar_t* _a, wchar_t* _b) :
00062    score(_score), Psubstitutions(_substitutions), Pinserts(_inserts), Pdeletions(_deletions), begin(_begin), end(_end), beginW(_beginW), endW(_endW), seg(NULL), a(_a), b(_b) {}
00063  };
00064 
00065 
00066  aligner(const std::wstring &fname, int const _debug = 0){
00067    sc = new phd<T>(fname);
00068    debug=_debug;
00069    // if(debug>3){ sc->show(cerr);}
00070  } //constructor
00071 
00072  ~aligner(){
00073    delete(sc);
00074  }
00075 
00076 
00077  /*
00078   * Alinia la cadena A contra la cadena B. Conceptualment es considera que A és la query
00079   */
00080  alin* align(const std::wstring &wa, const std::wstring &wb, const int mode = SEMILOCAL){
00081    // a is the short string with length tj
00082    // b is the long string with length ti
00083    // The algorithm searchs for the best match of a against b in a semi-local or global point of view.
00084    // Usage of global matching is discouraged for the sake of coherence :-) because it doesn't try
00085    // to keep the letters together so 'a' will be meaningless scattered through all 'b'
00086 
00087    const wchar_t *a=wa.c_str(); const int tj=wa.size();
00088    const wchar_t *b=wb.c_str(); const int ti=wb.size();
00089 
00090    /*
00091    //  Build the align matrix
00092    */
00093    int const W = ti+1;
00094 
00095    if( ti==0 || tj == 0 ){ return new alin(0,0,0,0,0,0,0,0,0,0); }
00096    int i,j;
00097    int* m = new int[W*(tj+1)+ti+1]; //int m[tj+1][ti+1];
00098 
00099    // Variables for alignment reconstruction
00100    wchar_t* answerA = new wchar_t[tj+ti];
00101    wchar_t* answerB = new wchar_t[tj+ti];
00102    int pA = 0;
00103    int pB = 0;
00104    int insertions=0, deletions=0, substitutions=0;
00105    int spacesA=0;
00106 
00107 
00108    // INITIALIZATION STEP
00109 
00110    // Decide the number of word that each character belongs to
00111    int nwords=0;
00112    for(int i=0; i<ti; i++){ 
00113      if( b[i] == L' ' || b[i] == L'_' ){
00114        // word change at _i_
00115        nwords++;
00116      }
00117    }
00118 
00119    int* words = new int[nwords];
00120    j=0;
00121    for(int i=0; i<ti; i++){ 
00122      if( b[i] == L' ' || b[i] == L'_' ){
00123        words[j++] = i;
00124      }
00125    }
00126 
00127    switch(mode){
00128    case GLOBAL:  //Start values are skips
00129    m[0]=0;
00130    for(int j=1; j<=tj; j++){ m[j*W] = m[(j-1)*W] + sc->dSkip(a[j-1]); }
00131    for(int i=1; i<=ti; i++){ m[i]   = m[i-1]     + sc->dSkip(b[i-1]); }
00132    break;
00133    default: //Start values are 0
00134    for(int j=0;j<=tj;j++){ m[j*W] = 0; }
00135    for(int i=1;i<=ti;i++){ m[i] = 0; }
00136    break;
00137    }
00138 
00139 
00140    /*
00141    // Throw the scoring process
00142    */
00143    int i1, i2, i3, i4, i5, indexInit, indexEnd, bestJ, bestI;
00144    indexEnd  = 0;
00145    indexInit = 0;
00146    int initWord  = 0;
00147    int endWord   = 0;
00148    score = -100000000;
00149    bestJ = tj;
00150    bestI = ti;
00151 
00152    // FILL THE FIRST COLUMN & ROW
00153    for(j=1;j<=tj;j++){
00154      m[j*W+1] = std::max(  m[W*(j-1)]+sc->dSub(a[j-1],b[0]) , m[W*(j-1)+1]+sc->dSkip(a[j-1]) );
00155      if(a[j-1]==L' '|| a[j-1]==L'_' ){spacesA++;}
00156      if(score < m[W*j+1]){
00157        score = m[W*j+1];
00158        bestJ = j;
00159        bestI = 0;
00160      }
00161    }
00162    for(i=1;i<=ti;i++){
00163      m[ W+i ] = std::max(  m[ i-1 ]+sc->dSub(a[0],b[i-1])  ,  m[ W+i-1 ]+sc->dSkip(b[i-1])  );
00164      if(score < m[W+i]){
00165        score = m[W+i];
00166        bestJ = 0;
00167        bestI = i;
00168      }
00169    }
00170 
00171    //FILL THE REST OF THE MATRIX
00172    //int step = ti/4;
00173    int lowLimit = 0;
00174    int upperLimit = 0;
00175    int threshold = -10000000;
00176    if( mode == LOCAL ){ threshold = 0; }
00177 
00178    for(j=2;j<=tj;j++){
00179      //if( mode==GLOBAL && tj/ti> 0.75){
00180      // // If the scoring is GLOBAL we do not fill the entire matrix
00181      // lowLimit   = max(  2 , j-step );
00182      // upperLimit = min( ti , j+step );
00183      // m[W*j+lowLimit+1] = 0;
00184      //      } else {
00185      lowLimit   = 2;
00186      upperLimit = ti;
00187      //      }
00188 
00189      for(i=lowLimit;i<=upperLimit;i++){
00190        i1 = m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]);
00191        i2 = m[W*j+i-1] + sc->dSkip(b[i-1]);
00192        i3 = m[W*(j-1)+i-2] + sc->dExp(a[j-1],b[i-2],b[i-1]);
00193        i4 = m[W*(j-1)+i] + sc->dSkip(a[j-1]);
00194        i5 = m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]);
00195        m[W*j+i] = std::max(threshold,std::max(i1,std::max(i1,std::max(i2,std::max(i3,std::max(i4,i5))))));
00196 
00197        if(score < m[W*j+i]){
00198          score = m[W*j+i];
00199          bestJ = j;
00200          bestI = i;
00201        }
00202 
00203      }
00204    }
00205 
00206    //Prints the full score matrix if needed
00207    /*if(debug>5){
00208      cerr << endl << "\t";
00209      for(i=0;i<ti;i++){ cerr << "\t" << b[i]; }
00210      cerr << endl << "\t";
00211      for(j=0;j<=tj;j++){
00212      if(j>0) cerr << a[j-1] << "\t";
00213      for(i=0;i<=ti;i++){
00214      cerr << m[W*j+i] << "\t";
00215      }
00216      cerr << endl;
00217      }
00218      }*/
00219 
00220 
00221    /*
00222      || Reconstruction Step
00223    */
00224    bool final = true;    //Exit condition
00225    int lastScore = score;
00226    int steps = 0; //Number of steps uset to align 'a' with 'b'
00227    int lastChar = 0;
00228 
00229 
00230    int ni=0;
00231    int mymax=0;
00232 
00233    switch(mode){
00234    case SEMILOCAL:
00235      /* In semi-local alignment, the best-path reconstructions doesn't start from the 
00236       * botom-right corner of the matrix but from the best-scoring cell among last
00237       * column and last row scores.
00238       * In this implementation only the last row is taken in account since it is 
00239       * the "long" string
00240       */
00241      for(i=0;i<=ti;i++){
00242        if( mymax < m[W*tj+i] ){
00243          mymax = m[W*tj+i];
00244          ni=i;
00245        }
00246      }
00247      score = mymax;
00248      for(i=ti-1;i>=ni;i--){ // Carry on half of the string for debugging purpouses
00249        answerA[pA++]= L'-';
00250        answerB[pB++]= b[i];
00251        insertions++;
00252      }
00253      i=ni; j=tj;
00254      break;
00255    case GLOBAL: // now this is global alignment
00256      i = ti; j = tj;
00257      break;
00258    case LOCAL: 
00259      /* In LOCAL alignment, the best-path reconstruction starts with at the best scoring 
00260       * position in the whole matrix.
00261       */
00262      j = bestJ;
00263      i = bestI;
00264      break;
00265    }
00266 
00267 
00268    while( final ){
00269 
00270      /*if(debug>5){
00271        cerr << "Looking for " << m[ W*j +i] << " in ("<<j<<","<<i<<")"<<endl;
00272        cerr << "m[j-1][i-1]=" << m[ W*(j-1) +i-1] << ": dSub=" << sc->dSub(a[j-1],b[i-1]) << endl;
00273        cerr << "m[j][i-1]=" <<   m[ W*j +i-1] << ": dSkipB=" << sc->dSkip(b[i-1]) << endl;
00274        cerr << "m[j-1][i]=" <<   m[ W*(j-1) +i] << ": dSkipA=" << sc->dSkip(a[j-1]) << endl;
00275        if(j>1){ cerr << "m[j-1][i-2]=" << m[ W*(j-1) +i-2] << ": dExp=" << sc->dExp(a[j-1],b[i-2],b[i-1]) << endl; }
00276        if(i>1){ cerr << "m[j-2][i-1]=" << m[ W*(j-2) +i-1] << ": dExp=" << sc->dExp(b[i-1],a[j-2],a[j-1]) << endl; }
00277        }*/
00278 
00279 
00280      if( j>0 && i>0 && m[W*j+i] ==  m[W*(j-1)+i-1] + sc->dSub(a[j-1],b[i-1]) ){
00281        //Aliniar b[i] amb a[j]
00282        lastScore = sc->dSub(a[j-1],b[i-1]);
00283        i--; j--;
00284        //cerr << "Sub a[" << j << "]=" << a[j] << " b[" << i << "]=" << b[i] << " @ " << pA << endl;
00285        answerA[pA++] = a[j];
00286        answerB[pB++] = b[i];
00287        indexInit= i;
00288        indexEnd = std::max(i,indexEnd);
00289        if( a[j]!=b[i] ){ substitutions++;  }
00290        steps++;
00291        lastChar = steps;
00292 
00293      } else if ( i>0 && j>0 && m[W*j+i] == m[W*j+i-1] + sc->dSkip(b[i-1]) ) {
00294        lastScore = sc->dSkip(b[i-1]);
00295        i--;
00296        answerA[pA++] = L'-';
00297        answerB[pB++] = b[i];
00298        insertions++;
00299        if(steps==0 && b[i]==L' ' && b[i]==L'_' ){
00300          score = m[W*j+i];
00301        } else {
00302          steps++;
00303        }
00304 
00305      } else if( j>1 && i>0 && m[W*j+i] ==  m[W*(j-2)+i-1] + sc->dExp(b[i-1],a[j-2],a[j-1]) ){
00306        lastScore =  sc->dExp(b[i-1],a[j-2],a[j-1]);
00307        i--; j-=2;
00308        answerA[pA++] = a[j];
00309        answerA[pA++] = a[j+1];
00310        answerB[pB++] = b[i];
00311        answerB[pB++] = L'+';
00312        indexInit = i;
00313        indexEnd = std::max(i,indexEnd);
00314        deletions++;
00315        if( a[j]!=b[i] ){ substitutions++; }
00316        steps+=2;
00317        lastChar = steps;
00318 
00319      } else if( j>0 && i>0 && m[W*j+i] ==  m[W*(j-1)+i] + sc->dSkip(a[j-1]) ){
00320        lastScore = sc->dSkip(a[j-1]);
00321        j--;
00322        answerA[pA++] = a[j];
00323        answerB[pB++] = L'-';
00324        indexInit = i;
00325        indexEnd = std::max(i,indexEnd);
00326        deletions++;
00327        steps++;
00328        lastChar = steps;
00329 
00330      } else if ( i>1 && j>0 && m[W*j+i] == m[ W*(j-1) +i-2 ] + sc->dExp(a[j-1],b[i-2],b[i-1]) ) {
00331        lastScore = sc->dExp(a[j-1],b[i-2],b[i-1]);
00332        j--; i--;
00333        answerA[pA++] = a[j];
00334        answerA[pA++] = L'+';
00335        answerB[pB++] = b[i];
00336        answerB[pB++] = b[i-1];
00337        indexInit = i;
00338        indexEnd = std::max(i,indexEnd);
00339        insertions++;
00340        if( a[j]!=b[i] ){ substitutions++; }
00341        i--;
00342        steps+=2;
00343        lastChar = steps;
00344 
00345      } else if ( j==0 ){
00346        i--;
00347        answerA[pA++] = L'-';
00348        answerB[pB++] = b[i];
00349        insertions++;
00350        if(steps!=0 || b[i]!=L' ' || b[i]!=L'_' ) steps++;
00351 
00352      } else if ( i==0 ){
00353        j--;
00354        answerA[pA++] = a[j];
00355        answerB[pB++] = L'-';
00356        deletions++;
00357 
00358        if(steps!=0 || b[i]!=L' ' || b[i]!=L'_' ) steps++;
00359        lastChar = steps;
00360         
00361      } else {
00362        std::wcerr << L"BOINK! Error at "<< j << L"," << i << std::endl;
00363        break;
00364      }
00365       
00366      // This is the termination condition
00367      switch(mode){
00368      case SEMILOCAL:
00369      final = j!=0;
00370      if(!final){
00371        i--;
00372        while(i>=0){
00373          answerA[pA++] = L'-';
00374          answerB[pB++] = b[i--];
00375          insertions++;
00376        }
00377      }
00378      break;
00379 
00380      case GLOBAL:
00381      final = i!=0 || j!=0;
00382      break;
00383 
00384      case LOCAL:
00385      final = m[W*j+i]!=0;
00386      break;
00387      }
00388    }
00389 
00390 
00391    // Computing of the score
00392    // Score= avg.points/phoneme
00393    steps = lastChar;
00394 
00395    switch(mode){
00396    case GLOBAL:
00397      // average of similarity per phonem (without spaces?)
00398      score /= (pA-spacesA);
00399      break;
00400    case SEMILOCAL:
00401      //if(debug>2) cerr << "     Score = " << score << " Last Score = " << lastScore << " steps = " << steps <<  endl;
00402      //score = (score-lastScore) / steps;
00403      score = score / steps;
00404      break;
00405    case LOCAL:
00406      score = score / steps;
00407      break;
00408    }
00409 
00410    // Print the alignment
00411    wchar_t* newA = new wchar_t[pA+1];
00412    wchar_t* newB = new wchar_t[pB+1];
00413    newA[pA]=0;
00414    newB[pB]=0;
00415    --pA;
00416    --pB;
00417 
00418    for(int i=0; pA>=0; ++i, --pA){
00419      newA[i] = answerA[pA];
00420    }
00421    for(int i=0; pB>=0; ++i, --pB){
00422      newB[i] = answerB[pB];
00423    }
00424 
00425    delete[](answerA);
00426    delete[](answerB);
00427    delete[] m;
00428 
00429    // Finds in wich word does indexInit and indexEnd fall
00430    for(int i=0; i<nwords; i++){ 
00431      if( words[i] == indexInit ){
00432        initWord = i+1;
00433        break;
00434      } else if( words[i] < indexInit ) {
00435        initWord = i+1;
00436      } else if( words[i] > indexInit ) {
00437        initWord = i;
00438        break;
00439      }
00440 
00441    }
00442 
00443    for(int i=0; i<nwords; i++){ 
00444      if( words[i] == indexEnd ){
00445        endWord = i;
00446        break;
00447      } else if( words[i] < indexEnd ) {
00448        endWord = i+1;
00449      } else if( words[i] > indexEnd ) {
00450        endWord = i;
00451        break;
00452      }
00453 
00454    }
00455 
00456    delete[] words;
00457 
00458    return new alin(score,indexInit,indexEnd,initWord,endWord,substitutions,insertions,deletions,newA,newB);
00459 
00460  }
00461 
00462 };
00463 
00464 
00465 #endif