(課程筆記)(NTUST) Information Retrieval

HW1 - Vector Space Model

TF、IDF計算方式:

一開始會讀取document的資料前處理，先去做一些normalized，讓lexicon不要有重複的term，而為了讓程式能在1分鐘內跑完，lexicon使用c++的unordered_map下去存，unordered_map使用hash function找到key term，用來尋找及修改object有著極高的效率。有了normalized後的資料，就可以計算每個term在每個檔案中的TF (local)，接著算出每個term的IDF (global)。TF: 該index Term在該Doc下出現的次數。IDF: 該index Term 在所有Doc下出現的文章次數。

Document Term Weight、Query Term Weight計算方式:

有了TF、IDF之後，就能算出Document Term Weight及Query Term Weight。這裡使用的方法是老師講義中的Scheme3，會有較佳的score。
左為:doc term 右為:query term

Vector Space Model運作原理及參數調整:

利用兩相同長度的向量(doc weight vector、query weight vector)求出cosine-similarity(兩向量之夾角)，此cosine-similarity值就代表ranking，Ranking值的範圍0 ≤ 𝑠𝑖𝑚 𝑞, 𝑑𝑗 ≤ 1。Ranking值高代表此doc與query的關聯性更高，排名會比其他ranking低的doc前面。而關於Vector Space Model參數的調整，我在老師的scheme3上進行調整，把query weight constant值改1.5、doc weight constant改2.5，有著較佳的結果。

Code

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311


# include <fstream>
# include <string>
# include <vector>
# include <iostream>
# include <algorithm>
# include <io.h>
# include <cmath>
# include <unordered_map>
using namespace std;

struct TermProperty {
  int tf[4190] = { 0 };
  int num_of_article_including;
  double idf;
  double weight;
}; // TermProperty

# define MAX_NUM_DOCS 4191.0

struct Dictionary {
  string term_name;
  string term_in_last_file_name;
  TermProperty term_property;
  bool operator==(const Dictionary& p) {
    return this->term_name == p.term_name;
  } // bool

  inline friend std::ostream& operator<<(std::ostream& os, Dictionary& p) {
    os << p.term_name;
    return os;
  } 

}; // Dictionary

struct Document {
  string doc_name;
  vector<string> all_term_names_in_file;
  vector<double> all_term_weight_in_file;
  double ranking;
}; // Document

struct Query {
  string file_name;
  vector<int> tf;
  vector<string> term_names;
  vector<double> term_weights;
}; // Query

struct compare_key {

    inline bool operator() (const Document& struct1, const Document& struct2) {
      if ( struct1.ranking == struct2.ranking && struct1.doc_name > struct2.doc_name ) {
        return 1;
      } // if
      else if ( struct1.ranking == struct2.ranking && struct1.doc_name < struct2.doc_name ) {
        return 0;
      } // else if

      return ( struct1.ranking > struct2.ranking );
    } // 
};

class Doc_Scanner {
  public:
    unordered_map<string, Dictionary> doc_terms_map;
    vector<Document> all_docs;
    int num_file;
    vector<Query> all_queries;
    
    void ReadAllDocuments() {
      string doc_files_path = "./docs/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( doc_files_path.c_str(), &file_info );
      int k = handle;
      if ( handle == -1 ) {
        cout << "Read Docs Error. Please check your docs path.\n";
      } // if
      else {
        num_file = 0;
        while ( k != -1 ) {
          string file_name = file_info.name;
          Document document;
          document.doc_name = file_name;
          document.ranking = -1;
          document.all_term_names_in_file = read_a_doc_to_dictionary( file_name );
          all_docs.push_back( document );
          num_file++;
          k = _findnext( handle, &file_info);
        } // while
      } // else

      _findclose( handle );
  } // ReadAllDocuments()

    void CalculateDocsTermWeight() {
      for ( int i = 0; i < all_docs.size(); i++ ) {
        for ( int j = 0; j < all_docs[i].all_term_names_in_file.size(); j++ ) {
          auto iterator = doc_terms_map.find( all_docs[i].all_term_names_in_file[j] );
          Dictionary doc_term = iterator -> second; 
          doc_term.term_property.idf = MAX_NUM_DOCS / doc_term.term_property.num_of_article_including;
          doc_term.term_property.weight = ( doc_term.term_property.tf[i] + 2.5 ) * log10( doc_term.term_property.idf );
          all_docs[i].all_term_weight_in_file.push_back( doc_term.term_property.weight );
          iterator -> second = doc_term;
        } // for

      } // for

    } // CalculateDocsTermWeight()

    void CalculateEveryQueryFile() {
      string query_files_path = "./queries/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( query_files_path.c_str(), &file_info );
      int k = handle;
      while ( k != -1 ) {
        string file_name = file_info.name;
        calculate_query_term_weight( file_name );
        k = _findnext( handle, &file_info );
      } // while

    } // RankEveryQuery()

    void RankEveryQueryFile( ofstream & output_file ) {
      output_file << "Query,RetrievedDocuments\n";
      for( int i = 0; i < all_queries.size(); i++ ) {
        query_file = all_queries[i];
        for( int j = 0; j < all_docs.size(); j++ ) {
          all_docs[j].ranking = rank_doc( all_docs[j] );
        } // for

        sort( all_docs.begin(), all_docs.end(), compare_key() );
        print_ranking( output_file );
      } // for

    } // RankEveryQueryFile()


  private:
    Query query_file;
    vector<double> query_weights;
    vector<double> doc_weights;
    
    void calculate_query_term_weight( string query_file_name ) {
      fstream file;
      string term_name;
      string path_name = "./queries/" + query_file_name;
      Query query;
      double weight;
      query.file_name = query_file_name;
      file.open( path_name, ios::in );
      while ( file.peek() != EOF ) {
        file >> term_name;
        auto iterator = doc_terms_map.find( term_name );
        if ( iterator != doc_terms_map.end() ) {
          Dictionary doc_term = iterator -> second;
          weight = 2.5 * log10( doc_term.term_property.idf );
          query.term_weights.push_back( weight );
          query.term_names.push_back( term_name );
        } // if

      } // while

      file.close();
      all_queries.push_back( query );
    } // calculate_query_term_weight()

    double rank_doc( Document doc ) {
      vector<double> temp_query_weights;
      vector<double> temp_doc_weights = doc.all_term_weight_in_file;
      int i;
      for ( i = 0; i < doc.all_term_names_in_file.size(); i++ ) {
        temp_query_weights.push_back( 0.0 );
      } // for

      for ( i = 0; i < query_file.term_names.size(); i++ ) {
        auto iterator = find( doc.all_term_names_in_file.begin(), doc.all_term_names_in_file.end(), query_file.term_names[i] );
        if ( iterator == doc.all_term_names_in_file.end() ) {
          temp_query_weights.push_back( query_file.term_weights[i] );
          temp_doc_weights.push_back( 0.0 );
        } // if
        else {
          int pos = distance( doc.all_term_names_in_file.begin(), iterator );
          double relpace_weight = query_file.term_weights[i];
          temp_query_weights[pos] = relpace_weight;
        } // else

      } // for

      query_weights = temp_query_weights;
      doc_weights = temp_doc_weights;
      return cosine_similarity();
    } // rank_doc()

    double cosine_similarity() {
      // cout << query_weights.size() << " " <<  doc_weights.size() << "\n";
      double dot = 0.0, denom_query = 0.0, denom_doc = 0.0;
      for ( int i = 0; i < doc_weights.size(); i++ ) {
        dot = dot + query_weights[i] * doc_weights[i];
        denom_query = denom_query + query_weights[i] * query_weights[i];
        denom_doc = denom_doc + doc_weights[i] * doc_weights[i];
      } // for()

      // cout << dot << " " << denom_query << " " << denom_doc << "\n";
      return dot / ( sqrt( denom_query ) * sqrt( denom_doc ) );
    } // cosine_similarity()

    int partition ( int low, int high ) { 
      double pivot = all_docs[high].ranking;    // pivot 
      int i = ( low - 1 );   
    
      for (int j = low; j < high; j++) { 
        if ( all_docs[j].ranking > pivot ) {
          i++;    // increment index of smaller element 
          Document doc;
          doc = all_docs[j];
          all_docs[j] = all_docs[i];
          all_docs[i] = doc;
        } // if

      } // for

      i++;
      Document doc;
      doc = all_docs[high];
      all_docs[high] = all_docs[i];
      all_docs[i] = doc;
      return i; 
    } // partition()

    void descending_order_document_ranking( int low, int high) {
      double pivot = partition( low, high );
      descending_order_document_ranking( low, pivot -1 );
      descending_order_document_ranking( pivot + 1, high );
    } // descending_order_document_ranking()
    

    void print_ranking( ofstream & output_file ) {
      string query_name = query_file.file_name;
      query_name.erase( query_name.end() -4, query_name.end() );
      output_file << query_name << ",";
      for( int i = 0; i < all_docs.size(); i++ ) {
        string term_name = all_docs[i].doc_name;
        term_name.erase( term_name.end() -4, term_name.end() );
        output_file << term_name << " ";
      } // for

      output_file << "\n";
    } // print_ranking()

    vector<string> read_a_doc_to_dictionary( string file_name ) {
      fstream file;
      string term_name;
      string path_name = "./docs/" + file_name;
      vector<string> term_names;
      file.open( path_name, ios::in );
      while ( file.peek() != EOF ) {
        file >> term_name;
        term_name = add_index_term_to_dictionary( term_name, file_name );
        if ( term_name != "" ) term_names.push_back( term_name );
      } // while

      file.close();
      return term_names;
    } // read_a_doc_to_dictionary()

    string add_index_term_to_dictionary( string term_name, string file_name ) {
      auto iterator = doc_terms_map.find( term_name );
      if ( iterator != doc_terms_map.end() ) {
        Dictionary doc_term = iterator -> second;
        doc_term.term_property.tf[num_file]++;
        if ( doc_term.term_in_last_file_name != file_name ) {
          doc_term.term_property.num_of_article_including++;
          doc_term.term_in_last_file_name = file_name;
        } // if
        else {
          term_name = "";
        } // else

        iterator -> second = doc_term;
      } // if
      else {
        TermProperty term_property;
        term_property.tf[num_file] = 1;
        term_property.num_of_article_including = 1;
        Dictionary doc_term;
        doc_term.term_name = term_name;
        doc_term.term_in_last_file_name = file_name;
        doc_term.term_property = term_property;
        doc_terms_map.insert( make_pair( term_name, doc_term ));
      } // else

      return term_name;
    } // add_index_term_to_dictionary()


}; // Doc_Scanner

int main() {
  cout << "Start scanning documents.\n";
  Doc_Scanner doc_scanner;
  doc_scanner.ReadAllDocuments();
  doc_scanner.CalculateDocsTermWeight();
  cout << "Scanning documents done.\n";
  cout << "Start Calulate Query Weight\n";
  // PrintVector( doc_terms );
  doc_scanner.CalculateEveryQueryFile();
  cout << "End Calulate Query Weight\n";
  ofstream output_file( "output_csv.csv" );
  doc_scanner.RankEveryQueryFile( output_file );
  cout << "End Program\n";
} // main()

HW2 - Best Match Model

Best Match 25L

一開始使用BM25L進行實作，經過多番嘗試參數，發現其值最多只能到0.70多，後改用BM25能獲得較高的數值。而BM25L改善了其long documents的問題，間接避免了overly-penalizing，shift parameter 𝛿 = 0.5能獲得較佳的結果。

Best Match 25原理及應用

Best Match 25一樣使用了TF 及 IDF，與the vector model不同的是BM25多加了一項document length normalization的principle，且BM25為BM15及BM11的組合，當 b = 0時，為BM15，b = 1時，為BM11，藉由調控b參數來獲得較佳的結果。

做完也發現BM25有著比the vector model更好的表現，需要的運算時間較少且精準度更高(無須做cosine-similarity是一大優勢)，但可調節的參數需要自行設定，且doument length太長時會有精準度較差的表現。

Best Match 25參數

在程式中第137~ 139行中(RankEveryQueryFile)分別設定了k1、k3及b值，獲得最佳參數為以下值，其中k3是隨意值，因query file中的term，tf(I,q)皆 = 1

K1 = 1.635 b = 0.85 k3 = 1000.0

藍色項: Document Length Normalization 綠色項: Inverse Document Frequency
中間項: Inverse Query Frequency可省略(在HW2中)

Code

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279


# include <fstream>
# include <string>
# include <vector>
# include <iostream>
# include <algorithm>
# include <io.h>
# include <cmath>
# include <unordered_map>
using namespace std;

struct TermProperty {
  int tf[4190] = { 0 };
  int num_of_article_including;
  double idf;
  double weight;
}; // TermProperty

# define MAX_NUM_DOCS 4191.0

struct Dictionary {
  string term_name;
  string term_in_last_file_name;
  TermProperty term_property;
  bool operator==(const Dictionary& p) {
    return this->term_name == p.term_name;
  } // bool

  inline friend std::ostream& operator<<(std::ostream& os, Dictionary& p) {
    os << p.term_name;
    return os;
  } 

}; // Dictionary

struct Document {
  string doc_name;
  float doc_length;
  vector<string> all_term_names_in_file;
  vector<double> all_term_tf_in_file;
  vector<double> all_term_ni_in_file;
  vector<double> all_term_weight_in_file;
  double ranking;
}; // Document

struct Query {
  string file_name;
  vector<int> tf;
  vector<string> term_names;
  vector<double> term_weights;
}; // Query

struct compare_key {

    inline bool operator() (const Document& struct1, const Document& struct2) {
      if ( struct1.ranking == struct2.ranking && struct1.doc_name > struct2.doc_name ) {
        return 1;
      } // if
      else if ( struct1.ranking == struct2.ranking && struct1.doc_name < struct2.doc_name ) {
        return 0;
      } // else if

      return ( struct1.ranking > struct2.ranking );
    } // 
};

class Doc_Scanner {
  public:
    unordered_map<string, Dictionary> doc_terms_map;
    vector<Document> all_docs;
    float avg_doclen;
    int num_file;
    vector<Query> all_queries;
    
    void ReadAllDocuments() {
      string doc_files_path = "./docs/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( doc_files_path.c_str(), &file_info );
      int k = handle;
      float doc_length = 0.0;
      avg_doclen = 0.0;
      if ( handle == -1 ) {
        cout << "Read Docs Error. Please check your docs path.\n";
      } // if
      else {
        num_file = 0;
        while ( k != -1 ) {
          string file_name = file_info.name;
          Document document;
          document.doc_name = file_name;
          document.ranking = -1;
          document.all_term_names_in_file = read_a_doc_to_dictionary( file_name, doc_length );
          document.doc_length = doc_length;
          avg_doclen += doc_length;
          all_docs.push_back( document );
          num_file++;
          k = _findnext( handle, &file_info);
          doc_length = 0.0;
        } // while
      } // else

      avg_doclen = avg_doclen / MAX_NUM_DOCS;
      _findclose( handle );
  } // ReadAllDocuments()

    void CalculateDocsTermWeight() {
      for ( int i = 0; i < all_docs.size(); i++ ) {
        for ( int j = 0; j < all_docs[i].all_term_names_in_file.size(); j++ ) {
          auto iterator = doc_terms_map.find( all_docs[i].all_term_names_in_file[j] );
          Dictionary doc_term = iterator -> second; 
          doc_term.term_property.idf = MAX_NUM_DOCS / doc_term.term_property.num_of_article_including;
          doc_term.term_property.weight = ( doc_term.term_property.tf[i] + 2.5 ) * log10( doc_term.term_property.idf );
          all_docs[i].all_term_tf_in_file.push_back( doc_term.term_property.tf[i] );
          all_docs[i].all_term_ni_in_file.push_back( doc_term.term_property.num_of_article_including );
          all_docs[i].all_term_weight_in_file.push_back( doc_term.term_property.weight );
          iterator -> second = doc_term;
        } // for

      } // for

    } // CalculateDocsTermWeight()

    void CalculateEveryQueryFile() {
      string query_files_path = "./queries/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( query_files_path.c_str(), &file_info );
      int k = handle;
      while ( k != -1 ) {
        string file_name = file_info.name;
        calculate_query_term_weight( file_name );
        k = _findnext( handle, &file_info );
      } // while

    } // RankEveryQuery()

    void RankEveryQueryFile( ofstream & output_file ) {
      output_file << "Query,RetrievedDocuments\n";
      k1 = 1.635;
      b = 0.85;
      k3 = 1000.0;
      for( int i = 0; i < all_queries.size(); i++ ) {
        query_file = all_queries[i];
        for( int j = 0; j < all_docs.size(); j++ ) {
          all_docs[j].ranking = rank_doc_BM25( all_docs[j] );
        } // for

        sort( all_docs.begin(), all_docs.end(), compare_key() );
        output_ranking( output_file );
      } // for

    } // RankEveryQueryFile()


  private:
    Query query_file;
    vector<double> query_weights;
    vector<double> doc_weights;
    double k1, b, k3;
    
    void calculate_query_term_weight( string query_file_name ) {
      fstream file;
      string term_name;
      string path_name = "./queries/" + query_file_name;
      Query query;
      double weight;
      query.file_name = query_file_name;
      file.open( path_name, ios::in );
      while ( file.peek() != EOF ) {
        file >> term_name;
        auto iterator = doc_terms_map.find( term_name );
        if ( iterator != doc_terms_map.end() ) {
          Dictionary doc_term = iterator -> second;
          weight = 2.5 * log10( doc_term.term_property.idf );
          query.term_weights.push_back( weight );
          query.term_names.push_back( term_name );
        } // if

      } // while

      file.close();
      all_queries.push_back( query );
    } // calculate_query_term_weight()

    double rank_doc_BM25( Document doc ) {
      double doc_score = 0.0, doc_temp = 0.0;
      for( int i = 0; i < query_file.term_names.size(); i++ ) {
        auto it = find( doc.all_term_names_in_file.begin(), doc.all_term_names_in_file.end(), query_file.term_names[i] );
        if ( it != doc.all_term_names_in_file.end() ) {
          int pos = it - doc.all_term_names_in_file.begin();
          doc_temp = ( ( ( k1 + 1.0 ) * doc.all_term_tf_in_file[pos] ) / ( k1 * ( ( 1 - b) + b * ( doc.doc_length / avg_doclen ) ) + doc.all_term_tf_in_file[pos] ) );
          doc_temp = doc_temp * ( ( k3 + 1.0 ) * 1.0 / ( k3 + 1.0 ) );
          auto iterator = doc_terms_map.find( query_file.term_names[i] );
          Dictionary doc_term = iterator -> second; 
          doc_temp = doc_temp * log10( ( MAX_NUM_DOCS - doc_term.term_property.num_of_article_including + 0.5 ) / ( doc_term.term_property.num_of_article_including + 0.5 ) );
        } // if

        doc_score = doc_score + doc_temp;
        doc_temp = 0.0;
      } // for()

      
      return doc_score;
    } // rank_doc_BM25()

    void output_ranking( ofstream & output_file ) {
      string query_name = query_file.file_name;
      query_name.erase( query_name.end() -4, query_name.end() );
      output_file << query_name << ",";
      for( int i = 0; i < all_docs.size(); i++ ) {
        string term_name = all_docs[i].doc_name;
        term_name.erase( term_name.end() -4, term_name.end() );
        output_file << term_name << " ";
      } // for

      output_file << "\n";
    } // output_ranking()

    vector<string> read_a_doc_to_dictionary( string file_name, float & doc_length ) {
      fstream file;
      string term_name;
      string path_name = "./docs/" + file_name;
      vector<string> term_names;
      file.open( path_name, ios::in );
      while ( file.peek() != EOF ) {
        file >> term_name;
        doc_length++;
        term_name = add_index_term_to_dictionary( term_name, file_name );
        if ( term_name != "" ) term_names.push_back( term_name );
      } // while

      file.close();
      return term_names;
    } // read_a_doc_to_dictionary()

    string add_index_term_to_dictionary( string term_name, string file_name ) {
      auto iterator = doc_terms_map.find( term_name );
      if ( iterator != doc_terms_map.end() ) {
        Dictionary doc_term = iterator -> second;
        doc_term.term_property.tf[num_file]++;
        if ( doc_term.term_in_last_file_name != file_name ) {
          doc_term.term_property.num_of_article_including++;
          doc_term.term_in_last_file_name = file_name;
        } // if
        else {
          term_name = "";
        } // else

        iterator -> second = doc_term;
      } // if
      else {
        TermProperty term_property;
        term_property.tf[num_file] = 1;
        term_property.num_of_article_including = 1;
        Dictionary doc_term;
        doc_term.term_name = term_name;
        doc_term.term_in_last_file_name = file_name;
        doc_term.term_property = term_property;
        doc_terms_map.insert( make_pair( term_name, doc_term ));
      } // else

      return term_name;
    } // add_index_term_to_dictionary()


}; // Doc_Scanner

int main() {
  cout << "Start scanning documents.\n";
  Doc_Scanner doc_scanner;
  doc_scanner.ReadAllDocuments();
  doc_scanner.CalculateDocsTermWeight();
  cout << "Scanning documents done.\n";
  cout << "Start Calulate Query Weight\n";
  // PrintVector( doc_terms );
  doc_scanner.CalculateEveryQueryFile();
  cout << "End Calulate Query Weight\n";
  ofstream output_file( "output_csv.csv" );
  doc_scanner.RankEveryQueryFile( output_file );
  cout << "End Program\n";
} // main()

HW3 - Mean Average Precision (MAP)

Code

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287


# include <fstream>
# include <string>
# include <vector>
# include <iostream>
# include <algorithm>
# include <io.h>
# include <cmath>
# include <unordered_map>
using namespace std;

struct TermProperty {
  int tf[14954] = { 0 };
  int num_of_article_including;
  double idf;
  double weight;
}; // TermProperty

# define MAX_NUM_DOCS 14955.000

struct Dictionary {
  string term_name;
  string term_in_last_file_name;
  TermProperty term_property;
  bool operator==(const Dictionary& p) {
    return this->term_name == p.term_name;
  } // bool

  inline friend std::ostream& operator<<(std::ostream& os, Dictionary& p) {
    os << p.term_name;
    return os;
  } 

}; // Dictionary

struct Document {
  string doc_name;
  float doc_length;
  vector<string> all_term_names_in_file;
  vector<double> all_term_tf_in_file;
  vector<double> all_term_ni_in_file;
  vector<double> all_term_weight_in_file;
  double ranking;
}; // Document

struct Query {
  string file_name;
  vector<int> tf;
  vector<string> term_names;
  vector<double> term_weights;
}; // Query

struct compare_key {

    inline bool operator() (const Document& struct1, const Document& struct2) {
      if ( struct1.ranking == struct2.ranking && struct1.doc_name > struct2.doc_name ) {
        return 1;
      } // if
      else if ( struct1.ranking == struct2.ranking && struct1.doc_name < struct2.doc_name ) {
        return 0;
      } // else if

      return ( struct1.ranking > struct2.ranking );
    } // 
};

class Doc_Scanner {
  public:
    unordered_map<string, Dictionary> doc_terms_map;
    vector<Document> all_docs;
    double avg_doclen;
    int num_file;
    vector<Query> all_queries;
    
    void ReadAllDocuments() {
      string doc_files_path = "./docs/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( doc_files_path.c_str(), &file_info );
      int k = handle;
      float doc_length = 0.0;
      avg_doclen = 0.0;
      if ( handle == -1 ) {
        cout << "Read Docs Error. Please check your docs path.\n";
      } // if
      else {
        num_file = 0;
        while ( k != -1 ) {
          string file_name = file_info.name;
          Document document;
          document.doc_name = file_name;
          document.ranking = -1;
          document.all_term_names_in_file = read_a_doc_to_dictionary( file_name, doc_length );
          document.doc_length = doc_length;
          avg_doclen = avg_doclen + doc_length;
          all_docs.push_back( document );
          num_file++;
          k = _findnext( handle, &file_info);
          doc_length = 0.0;
        } // while
      } // else

      avg_doclen = avg_doclen / MAX_NUM_DOCS;
      _findclose( handle );
  } // ReadAllDocuments()

    void CalculateDocsTermWeight() {
      for ( int i = 0; i < all_docs.size(); i++ ) {
        for ( int j = 0; j < all_docs[i].all_term_names_in_file.size(); j++ ) {
          auto iterator = doc_terms_map.find( all_docs[i].all_term_names_in_file[j] );
          Dictionary doc_term = iterator -> second; 
          doc_term.term_property.idf = MAX_NUM_DOCS / doc_term.term_property.num_of_article_including;
          doc_term.term_property.weight = ( doc_term.term_property.tf[i] ) * log10( doc_term.term_property.idf );
          all_docs[i].all_term_tf_in_file.push_back( doc_term.term_property.tf[i] );
          all_docs[i].all_term_ni_in_file.push_back( doc_term.term_property.num_of_article_including );
          all_docs[i].all_term_weight_in_file.push_back( doc_term.term_property.weight );
          iterator -> second = doc_term;
        } // for

      } // for

    } // CalculateDocsTermWeight()

    void CalculateEveryQueryFile() {
      string query_files_path = "./queries/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( query_files_path.c_str(), &file_info );
      int k = handle;
      while ( k != -1 ) {
        string file_name = file_info.name;
        calculate_query_term_weight( file_name );
        k = _findnext( handle, &file_info );
      } // while

    } // RankEveryQuery()

    void RankEveryQueryFile( ofstream & output_file ) {
      output_file << "Query,RetrievedDocuments\n";
      k1 = 1.4;
      b = 0.5;
      q = 0.4;
      k3 = 1000;
      for( int i = 0; i < all_queries.size(); i++ ) {
        query_file = all_queries[i];
        for( int j = 0; j < all_docs.size(); j++ ) {
          all_docs[j].ranking = rank_doc_BM25( all_docs[j] );
        } // for

        sort( all_docs.begin(), all_docs.end(), compare_key() );
        output_ranking( output_file );
      } // for

    } // RankEveryQueryFile()


  private:
    Query query_file;
    vector<double> query_weights;
    vector<double> doc_weights;
    double k1, b, k3, q;
    
    void calculate_query_term_weight( string query_file_name ) {
      fstream file;
      string term_name;
      string path_name = "./queries/" + query_file_name;
      Query query;
      double weight;
      query.file_name = query_file_name;
      file.open( path_name, ios::in );
      while ( file.peek() != EOF ) {
        file >> term_name;
        auto iterator = doc_terms_map.find( term_name );
        if ( iterator != doc_terms_map.end() ) {
          Dictionary doc_term = iterator -> second;
          weight = 2.5 * log10( doc_term.term_property.idf );
          query.term_weights.push_back( weight );
          query.term_names.push_back( term_name );
        } // if

      } // while

      file.close();
      all_queries.push_back( query );
    } // calculate_query_term_weight()

    double rank_doc_BM25( Document doc ) {
      double doc_score = 0.0, doc_temp = 0.0;
      for( int i = 0; i < query_file.term_names.size(); i++ ) {
        auto it = find( doc.all_term_names_in_file.begin(), doc.all_term_names_in_file.end(), query_file.term_names[i] );
        if ( it != doc.all_term_names_in_file.end() ) { 
          int pos = it - doc.all_term_names_in_file.begin();
          doc_temp = doc.all_term_tf_in_file[pos] /  ( ( 1 - b ) + b * ( doc.doc_length / avg_doclen ) );
          if ( doc_temp > 0 ) {
            doc_temp = ( ( k1 + 1.0 ) * ( doc_temp + q ) ) / ( k1 + doc_temp + q );
          } // if
          else {
            doc_temp = 0;
          } // else

          doc_score = doc_score + doc_temp;
          doc_score = doc_score * ( ( k3 + 1.0 ) * 1.0 / ( k3 + 1.0 ) );
          auto iterator = doc_terms_map.find( query_file.term_names[i] );
          Dictionary doc_term = iterator -> second; 
          doc_score = doc_score * log10( ( MAX_NUM_DOCS - doc_term.term_property.num_of_article_including + 0.5 ) / ( doc_term.term_property.num_of_article_including + 0.5 ) );
        } // if

        doc_score = doc_score + doc_temp;
        doc_temp = 0.0;
      } // for()
      
      return doc_score;
    } // rank_doc_BM25()

    void output_ranking( ofstream & output_file ) {
      string query_name = query_file.file_name;
      query_name.erase( query_name.end() -4, query_name.end() );
      output_file << query_name << ",";
      for( int i = 0; i < all_docs.size(); i++ ) {
        string term_name = all_docs[i].doc_name;
        term_name.erase( term_name.end() -4, term_name.end() );
        output_file << term_name << " ";
      } // for

      output_file << "\n";
    } // output_ranking()

    vector<string> read_a_doc_to_dictionary( string file_name, float & doc_length ) {
      fstream file;
      string term_name;
      string path_name = "./docs/" + file_name;
      vector<string> term_names;
      file.open( path_name, ios::in );
      while ( file.peek() != EOF ) {
        file >> term_name;
        doc_length++;
        term_name = add_index_term_to_dictionary( term_name, file_name );
        if ( term_name != "" ) term_names.push_back( term_name );
      } // while

      file.close();
      return term_names;
    } // read_a_doc_to_dictionary()

    string add_index_term_to_dictionary( string term_name, string file_name ) {
      auto iterator = doc_terms_map.find( term_name );
      if ( iterator != doc_terms_map.end() ) {
        Dictionary doc_term = iterator -> second;
        doc_term.term_property.tf[num_file]++;
        if ( doc_term.term_in_last_file_name != file_name ) {
          doc_term.term_property.num_of_article_including++;
          doc_term.term_in_last_file_name = file_name;
        } // if
        else {
          term_name = "";
        } // else

        iterator -> second = doc_term;
      } // if
      else {
        TermProperty term_property;
        term_property.tf[num_file] = 1;
        term_property.num_of_article_including = 1;
        Dictionary doc_term;
        doc_term.term_name = term_name;
        doc_term.term_in_last_file_name = file_name;
        doc_term.term_property = term_property;
        doc_terms_map.insert( make_pair( term_name, doc_term ));
      } // else

      return term_name;
    } // add_index_term_to_dictionary()


}; // Doc_Scanner

int main() {
  cout << "Start scanning documents.\n";
  Doc_Scanner doc_scanner;
  doc_scanner.ReadAllDocuments();
  doc_scanner.CalculateDocsTermWeight();
  cout << "Scanning documents done.\n";
  cout << "Start Calulate Query Weight\n";
  // PrintVector( doc_terms );
  doc_scanner.CalculateEveryQueryFile();
  cout << "End Calulate Query Weight\n";
  ofstream output_file( "output_csv.csv" );
  doc_scanner.RankEveryQueryFile( output_file );
  cout << "End Program\n";
} // main()

HW4 - PLSA Model

PLSA參數調法

Topic_num: 8 / a (第一項local TF機率term = 0.6) all_doc使用vector存local tf
b (第二項gobal TF機率term = 0.25) / Inter_run: 30 dictionary使用map存word與position、global TF存

PLSA理論心得

EM可求得隱含變量T(k)，加入了Pd_z[z][m]、Pw_z[z][w]讓其隨機初始值和為1。透過每個TOPIC下對於Pd_z[z][m] (每一份DOC(m)下是TOPIC(z)主題之機率)與 Pw_z[z][w] (每一份TOPIC(z)下term(w)之機率)，利用兩者獨立機率反覆對Pz_dw[z][m][position]進行更新。

PLSA實作心得

程式流程為scanning documetns➡ Init Parameter ( two for m-step、one for e-step) ➡ Run EM for 30 runs ➡ calculate for every query file and log prob

In step 1: 存取所有doc並記錄local、global TF到對應容器中

In step 2: C++需要new出memory space for these parameters.

In step 3: 每run完一次EM完，就會得到更新的Pd_z[z][m]、Pw_z[z][w] for m-step，利用這兩個又繼續更新Pz_dw[z][m][position] for e-step，訓練到接近收斂

In step 4: 與query file中term進行計算，得出每個doc對應query之機率並取log以大排到小輸出。

選用最小Topic(8)的情況底下跑了30 run，需要非常久的時間，雖然在map上已選用unordered_map最快的容器下去實作，但時間還是花得比想像中多，在猜想的原因是因為某些可以使用map下去做映射term index的部分，比起vector中的find速度會快上不少，這次的model比起以往需要更多的時間下去完成試驗，下次的作業應提早開始準備，這樣才有更多時間進行測試。

Code

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452


# include <fstream>
# include <string>
# include <vector>
# include <iostream>
# include <algorithm>
# include <io.h>
# include <cmath>
# include <unordered_map>
# include <cstdlib>
# include <stdlib.h>
# include <ctime>
using namespace std;

# define MAX_NUM_DOC 14955.0
# define MAX_NUM_QUERY 100.0
# define MAX_NUM_TOPIC 8.0
# define MAX_NUM_ITER 30.0
# define MAX_LENGTH 7059938.0
# define MAX_NUM_WORDS 111449.0

struct TermProperty {
  string term_name;
  double tf;
  double prob; // probability ( first or third term )
}; // TermProperty

struct Document {
  int docID;
  string doc_name;
  double ranking;
  double doc_length;
  vector<TermProperty> term_properties;
}; // Documentc

struct Word2doc {
  int docID;
  int position;
}; // Word2doc

struct Dictionary {
  TermProperty term_property;
  vector<Word2doc> word2doc;
  int word2dic;
  bool operator==(const Dictionary& p) {
    return this->term_property.term_name == p.term_property.term_name;
  } // bool

  inline friend std::ostream& operator<<(std::ostream& os, Dictionary& p) {
    os << p.term_property.term_name;
    return os;
  } 

}; // Dictionary

struct compare_key {

    inline bool operator() (const Document& struct1, const Document& struct2) {
      if ( struct1.ranking == struct2.ranking && struct1.doc_name > struct2.doc_name ) {
        return 1;
      } // if
      else if ( struct1.ranking == struct2.ranking && struct1.doc_name < struct2.doc_name ) {
        return 0;
      } // else if

      return ( struct1.ranking > struct2.ranking );
    } // 
};


class PLSA {
  public:
    unordered_map<string, Dictionary> dictionary;
    vector<Document> all_docs;
    vector<string> all_words;
    int num_file;
    int all_docs_len;
    
    void ReadAllDocuments() {
      all_docs_len = 0;
      string doc_files_path = "./docs/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( doc_files_path.c_str(), &file_info );
      int k = handle;
      double doc_length = 0.0;
      if ( handle == -1 ) {
        cout << "Read Docs Error. Please check your docs path.\n";
      } // if
      else {
        num_file = 0;
        while ( k != -1 ) {
          string file_name = file_info.name;
          Document document;
          document.doc_name = file_name;
          document.ranking = -1;
          document.term_properties = read_a_doc_to_dictionary( file_name, doc_length );
          document.doc_length = doc_length;
          document.docID = num_file;
          all_docs_len += doc_length;
          all_docs.push_back( document );
          num_file++;
          doc_length = 0.0;
          k = _findnext( handle, &file_info);
        } // while

      } // else

      _findclose( handle );
      
  } // ReadAllDocuments()

    void Init() {
      num_data = MAX_NUM_DOC;
      num_words = MAX_NUM_WORDS;
      num_topics = MAX_NUM_TOPIC;
      srand(time(NULL));    //設置随機數種子，使每次獲取的随機序列不同。

      Pd_z  = new double*[num_topics-1];
      for ( int z = 0; z < num_topics; z++ ) {
        double norm_down = 0.0;
        Pd_z[z] = new double[num_data-1];
        for( int m = 0; m < num_data; m++ ) {
          Pd_z[z][m] = (rand()%100000)*0.00001;
          norm_down += Pd_z[z][m];
        } // for

        for( int m = 0; m < num_data; m++ ) {
          Pd_z[z][m] /= norm_down;
        } // for

      } // for

      cout << "Init step1 done\n";
      Pw_z  = new double*[num_topics-1];
      for ( int z = 0; z < num_topics; z++ ) {
        double norm_down = 0.0;
        Pw_z[z] = new double[num_words-1];
        for( int w = 0; w < num_words; w++ ) {
          Pw_z[z][w] = (rand()%100000)*0.00001;
          norm_down += Pw_z[z][w];                               
        } // for

        for( int w = 0; w < num_words; w++ ) {
          Pw_z[z][w] /= norm_down;
        } // for

      } // for

      cout << "Init step2 done\n";
      Pz_dw = new double **[num_topics-1];
      for ( int z = 0; z < num_topics; z++ ) {
        Pz_dw[z] = new double *[num_data-1];
        for ( int m = 0; m < num_data; m++ ) {
          int size = ( int ) all_docs[m].term_properties.size();
          Pz_dw[z][m] = new double[size-1];
        } // for()

      } // for()

      cout << "Init step3 done\n";

    } // Init()

    void DoPLSA() {
      // start training
      for ( int i = 0; i < MAX_NUM_ITER; i++ ) {
        Estep();
        cout << "iteration: Estep: " << i << "\n";
        Mstep_1();
        cout << "iteration: Mstep_1: " << i << "\n";
        Mstep_2();
        cout << "iteration: Mstep_2: " << i << "\n";   
        system( "pause" );     
      } // for()
      
    } // DoPLSA()

    void RankEveryQueryFile( ofstream & output_file ) {
      a = 0.6;
      b = 0.25;
      output_file << "Query,RetrievedDocuments\n";
      string query_files_path = "./queries/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( query_files_path.c_str(), &file_info );
      int k = handle;
      while ( k != -1 ) {
        string file_name = file_info.name;
        calculate_query_prob( file_name );
        sort( all_docs.begin(), all_docs.end(), compare_key() );
        output_ranking( output_file, file_name );
        k = _findnext( handle, &file_info );
      } // while

    } // RankEveryQueryFile() 

  private:
    int num_data; // number of data
    int num_words; // number of words
    int num_topics; // number of topics
    double a, b;
    double **Pd_z = NULL;
    double **Pw_z = NULL;
    double ***Pz_dw = NULL;
    double **Dw_m = NULL;

    vector<TermProperty> read_a_doc_to_dictionary( string file_name, double & doc_length ) {
      fstream file;
      string term_name;
      string path_name = "./docs/" + file_name;
      vector<TermProperty> term_properties;
      TermProperty term_property;
      Word2doc word2doc;
      Dictionary doc_term;
      unordered_map<string, int> doc2dic;
      file.open( path_name, ios::in );
      int position = 0;
      while ( file.peek() != EOF ) {
        file >> term_name;
        doc_length++; 
        term_property.term_name = term_name; 
        auto it_dic = dictionary.find( term_name );
        if ( it_dic != dictionary.end() ) {
          doc_term = it_dic -> second;
          doc_term.term_property.tf++;
          doc_term.term_property.prob = doc_term.term_property.tf / MAX_LENGTH;
          if ( doc_term.word2doc[doc_term.word2doc.size()-1].docID != num_file ) {
            word2doc.docID = num_file;
            word2doc.position = position;
            doc_term.word2doc.push_back( word2doc );
            term_property.tf = 1;
            term_properties.push_back(term_property);
            doc2dic.insert(make_pair(term_name, position));
            position++;
          } // if
          else {
            auto it = doc2dic.find( term_name );
            int pos = it -> second;
            term_properties[pos].tf++;              
          } // else

          it_dic -> second = doc_term;
        } // if
        else {
          term_property.tf = 1;
          term_property.prob = term_property.tf / MAX_LENGTH;
          word2doc.docID = num_file;
          word2doc.position = position;
          doc_term.term_property = term_property;
          doc_term.word2doc.push_back( word2doc );
          doc_term.word2dic = all_words.size();
          all_words.push_back( term_name );
          dictionary.insert( make_pair( term_name, doc_term ));
          position++;
          term_properties.push_back(term_property);
          doc2dic.insert(make_pair(term_name, position));
        } // else

      } // while

      for (int i = 0; i < term_properties.size(); i++ ) {
        term_properties[i].prob = term_properties[i].tf / doc_length;
      } // for

      file.close();
      return term_properties;
    } // read_a_doc_to_dictionary()
    
    void Estep() {
      cout << all_docs.size() << " " << num_data << "\n";
      for ( int m = 0; m < num_data; m++ ) {
        for ( int position = 0; position < all_docs[m].term_properties.size(); position++ ) {
          double norm_down = 0.0;
          auto it_dic = dictionary.find( all_docs[m].term_properties[position].term_name );
          if ( it_dic == dictionary.end() ) cout << "cry\n";
          int w = it_dic -> second.word2dic;
          for ( int z = 0; z < num_topics; z++ ) {
            double val = Pd_z[z][m] * Pw_z[z][w];
            Pz_dw[z][m][position] = val;
            norm_down += val;
          } // for()

          if ( norm_down != 0.0 ) {
            for ( int z = 0; z < num_topics; z++ ) {
              Pz_dw[z][m][position] /= norm_down;
            } // for

          } // if
          else {
            for ( int z = 0; z < num_topics; z++ ) {
              Pz_dw[z][m][position] = 0.0;
            } // for

          } // else

          
        } // for()

        if ( m >= 14950 ) {
          cout << all_docs[m].doc_name << " " << all_docs[m].term_properties.size() << " qq1\n";
        } // if

      } // for
    
    } // Estep()

    void Mstep_1() {
      // Pd_z[z][m]
      for ( int z = 0; z < num_topics; z++ ) {
        double norm_down = 0.0;
        for ( int m = 0; m < num_data; m++ ) {
          double sum = 0.0;
          for ( int position = 0; position < all_docs[m].term_properties.size(); position++ ) {
            auto it = dictionary.find( all_docs[m].term_properties[position].term_name );
            sum += it -> second.term_property.tf * Pz_dw[z][m][position];
          } // for

          Pd_z[z][m] = sum;
          norm_down += sum;
        } // for()

        if ( norm_down != 0 ) {
          for ( int m = 0; m < num_data; m++) {
            Pd_z[z][m] /= norm_down;
          } // for

        } // if
        else {
          for ( int m = 0; m < num_data; m++ ) {
            Pd_z[z][m] = 0;
          } // for

        } // else

      } // for()

    } // Mstep_1()

    void Mstep_2() {
      // Pw_z[z][w]
      for ( int z = 0; z < num_topics; z++ ) {
        double norm_down = 0.0;
        for ( int w = 0; w < num_words; w++ ) {
          double sum = 0.0;
          string word = all_words[w];
          auto it = dictionary.find( word );
          vector<Word2doc> word2docs = it -> second.word2doc;
          for ( int i = 0; i < word2docs.size(); i++ ) {
            int m = word2docs[i].docID;
            int position = word2docs[i].position;
            auto it_2 = dictionary.find( all_docs[m].term_properties[position].term_name );
            sum += it_2 -> second.term_property.tf * Pz_dw[z][m][position];
          } // for
          
          Pw_z[z][w] = sum;
          norm_down += sum;
        } // for

        if ( norm_down != 0 ) {
          for ( int w = 0; w < num_words; w++) {
            Pw_z[z][w] /= norm_down;
          } // for

        } // if
        else {
          for ( int w = 0; w < num_words; w++) {
            Pw_z[z][w] = 0;
          } // for

        } // else

      } // for

    } // Mstep_2()

    void calculate_query_prob( string query_file_name ) {
      fstream file;
      string term_name;
      string path_name = "./queries/" + query_file_name;
      file.open( path_name, ios::in );
      vector<string> all_query_term;
      while ( file.peek() != EOF ) {
        file >> term_name;
        all_query_term.push_back( term_name );
      } // while

      for ( int i = 0; i < all_docs.size(); i++ ) {
        double prob = 0.0;
        for ( int j = 0; j < all_query_term.size(); j++ ) {
          prob = prob * calculate_term_prob( all_docs[i], all_query_term[j] );
          if ( prob == 0.0 ) break;
        } // for

        if ( prob != 0.0 ) prob = log10( prob );
        all_docs[i].ranking = prob;
      } // for

    } // calculate_query_prob()

    double calculate_term_prob( Document doc, string query_term ) {
      double prob = 0.0, temp_prob = 0.0;
      auto it_dic = dictionary.find( query_term );
      if ( it_dic == dictionary.end() ) return prob;

      int m = doc.docID;
      int w = it_dic -> second.word2dic;
      vector<TermProperty> term_properties = doc.term_properties;
      TermProperty term_property;
      // first term
      auto it_doc = find_if( term_properties.begin(), term_properties.end(), [query_term] (const TermProperty& s) { return s.term_name == query_term; } );
      if ( it_doc != term_properties.end() ) {
        term_property = *it_doc; 
        prob += term_property.prob * a;
      } // if

      // second term and third term
      for ( int z = 0; z < num_topics; z++ ) {
        temp_prob += Pd_z[z][m] * Pw_z[z][w] ;
      } // for

      prob += temp_prob * b;
      prob += ( 1.0 - a - b ) * it_dic -> second.term_property.prob;
      return prob;
    }  // calculate_term_prob()

    void output_ranking( ofstream & output_file, string query_file ) {
      query_file.erase( query_file.end() -4, query_file.end() );
      output_file << query_file << ",";
      for( int i = 0; i < all_docs.size(); i++ ) {
        string term_name = all_docs[i].doc_name;
        term_name.erase( term_name.end() -4, term_name.end() );
        output_file << term_name << " ";
      } // for

      output_file << "\n";
    } // output_ranking()

}; // PLSA

int main() {
  cout << "Start scanning documents.\n";
  PLSA plsa;
  plsa.ReadAllDocuments();
  system("pause");
  cout << "Scanning documents done.\n";
  cout << "Start Initing.\n";
  plsa.Init(); 
  cout << "End Initing.\n";
  plsa.DoPLSA();
  cout << "Done Training.\n";
  ofstream output_file( "output_csv.csv" );
  plsa.RankEveryQueryFile( output_file );
  cout << "End Program\n";  
} // main()

HW5 - Query Modeling (Rocchio Algorithm)

Query Modeling (Rocchio Algorithm)參數調法

First Information Retrieval: BM25 k1= 0.8 b = 0.7 k3 = 100.0
Second Information Retrieval: VSM
Rocchio algorithm: a = 2.5 b = 0.4 relevant doc = 10 non-relevant doc = 1 (c = 0.1)
如果First Information Retrieval使用PLSA能獲得較高的分數，但需要的memory極大。

Query Modeling (Rocchio Algorithm)理論心得

由於query所蒐集的資料量太少，導致資料檢索前所獲得的資訊太少，而透過使用Pseudo-Relevance Feedback的過程得到加強後的query term，再進行第二次資料檢索，能獲得較好的結果。Query Modeling利用此核心精神下去實作，而本次實作Pseudo-Relevance Feedback的方法為Rocchio Algorithm。

Query Modeling (Rocchio Algorithm)實作心得

程式流程為scanning documents ➡ calculate every word TF-IDF ➡Run BM25/PLSA for First IR ➡ using Rocchio for updating new query weight ➡ Run VSM for Second IR

In step 1、2: 存取所有doc並記錄local、global TF到對應容器中

In step 3: 計算BM25並排序前10篇文章為relevant docs

In step 4: update query vector on relevant docs , normalize new query weight and add original query weight ( a = 2.5, b =0.4 ) if using non-relevant doc ( c =0.1)
In step 5: normalize every doc weight and do VSM separately.

在實作過程中發現Relevant Doc篇數的設定也會有很大的影響, 不一定多比較好, 太少也可能不行，10~15篇是獲得此資料集較理想的Doc數。而參數b不能設得太大否則會讓new query weight跟original query weight的權重變得模糊，效果不好，最後在統一vector長度時，當分子其中一項為0時可以不算來增加運行速度。

Code

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287


# include <fstream>
# include <string>
# include <vector>
# include <iostream>
# include <algorithm>
# include <io.h>
# include <cmath>
# include <unordered_map>
using namespace std;

struct TermProperty {
  int tf[14954] = { 0 };
  int num_of_article_including;
  double idf;
  double weight;
}; // TermProperty

# define MAX_NUM_DOCS 14955.000

struct Dictionary {
  string term_name;
  string term_in_last_file_name;
  TermProperty term_property;
  bool operator==(const Dictionary& p) {
    return this->term_name == p.term_name;
  } // bool

  inline friend std::ostream& operator<<(std::ostream& os, Dictionary& p) {
    os << p.term_name;
    return os;
  } 

}; // Dictionary

struct Document {
  string doc_name;
  float doc_length;
  vector<string> all_term_names_in_file;
  vector<double> all_term_tf_in_file;
  vector<double> all_term_ni_in_file;
  vector<double> all_term_weight_in_file;
  double ranking;
}; // Document

struct Query {
  string file_name;
  vector<int> tf;
  vector<string> term_names;
  vector<double> term_weights;
}; // Query

struct compare_key {

    inline bool operator() (const Document& struct1, const Document& struct2) {
      if ( struct1.ranking == struct2.ranking && struct1.doc_name > struct2.doc_name ) {
        return 1;
      } // if
      else if ( struct1.ranking == struct2.ranking && struct1.doc_name < struct2.doc_name ) {
        return 0;
      } // else if

      return ( struct1.ranking > struct2.ranking );
    } // 
};

class Doc_Scanner {
  public:
    unordered_map<string, Dictionary> doc_terms_map;
    vector<Document> all_docs;
    double avg_doclen;
    int num_file;
    vector<Query> all_queries;
    
    void ReadAllDocuments() {
      string doc_files_path = "./docs/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( doc_files_path.c_str(), &file_info );
      int k = handle;
      float doc_length = 0.0;
      avg_doclen = 0.0;
      if ( handle == -1 ) {
        cout << "Read Docs Error. Please check your docs path.\n";
      } // if
      else {
        num_file = 0;
        while ( k != -1 ) {
          string file_name = file_info.name;
          Document document;
          document.doc_name = file_name;
          document.ranking = -1;
          document.all_term_names_in_file = read_a_doc_to_dictionary( file_name, doc_length );
          document.doc_length = doc_length;
          avg_doclen = avg_doclen + doc_length;
          all_docs.push_back( document );
          num_file++;
          k = _findnext( handle, &file_info);
          doc_length = 0.0;
        } // while
      } // else

      avg_doclen = avg_doclen / MAX_NUM_DOCS;
      _findclose( handle );
  } // ReadAllDocuments()

    void CalculateDocsTermWeight() {
      for ( int i = 0; i < all_docs.size(); i++ ) {
        for ( int j = 0; j < all_docs[i].all_term_names_in_file.size(); j++ ) {
          auto iterator = doc_terms_map.find( all_docs[i].all_term_names_in_file[j] );
          Dictionary doc_term = iterator -> second; 
          doc_term.term_property.idf = MAX_NUM_DOCS / doc_term.term_property.num_of_article_including;
          doc_term.term_property.weight = ( doc_term.term_property.tf[i] ) * log10( doc_term.term_property.idf );
          all_docs[i].all_term_tf_in_file.push_back( doc_term.term_property.tf[i] );
          all_docs[i].all_term_ni_in_file.push_back( doc_term.term_property.num_of_article_including );
          all_docs[i].all_term_weight_in_file.push_back( doc_term.term_property.weight );
          iterator -> second = doc_term;
        } // for

      } // for

    } // CalculateDocsTermWeight()

    void CalculateEveryQueryFile() {
      string query_files_path = "./queries/*.txt"; 
      struct _finddata_t file_info;
      int handle = _findfirst( query_files_path.c_str(), &file_info );
      int k = handle;
      while ( k != -1 ) {
        string file_name = file_info.name;
        calculate_query_term_weight( file_name );
        k = _findnext( handle, &file_info );
      } // while

    } // RankEveryQuery()

    void RankEveryQueryFile( ofstream & output_file ) {
      output_file << "Query,RetrievedDocuments\n";
      k1 = 1.4;
      b = 0.5;
      q = 0.4;
      k3 = 1000;
      for( int i = 0; i < all_queries.size(); i++ ) {
        query_file = all_queries[i];
        for( int j = 0; j < all_docs.size(); j++ ) {
          all_docs[j].ranking = rank_doc_BM25( all_docs[j] );
        } // for

        sort( all_docs.begin(), all_docs.end(), compare_key() );
        output_ranking( output_file );
      } // for

    } // RankEveryQueryFile()


  private:
    Query query_file;
    vector<double> query_weights;
    vector<double> doc_weights;
    double k1, b, k3, q;
    
    void calculate_query_term_weight( string query_file_name ) {
      fstream file;
      string term_name;
      string path_name = "./queries/" + query_file_name;
      Query query;
      double weight;
      query.file_name = query_file_name;
      file.open( path_name, ios::in );
      while ( file.peek() != EOF ) {
        file >> term_name;
        auto iterator = doc_terms_map.find( term_name );
        if ( iterator != doc_terms_map.end() ) {
          Dictionary doc_term = iterator -> second;
          weight = 2.5 * log10( doc_term.term_property.idf );
          query.term_weights.push_back( weight );
          query.term_names.push_back( term_name );
        } // if

      } // while

      file.close();
      all_queries.push_back( query );
    } // calculate_query_term_weight()

    double rank_doc_BM25( Document doc ) {
      double doc_score = 0.0, doc_temp = 0.0;
      for( int i = 0; i < query_file.term_names.size(); i++ ) {
        auto it = find( doc.all_term_names_in_file.begin(), doc.all_term_names_in_file.end(), query_file.term_names[i] );
        if ( it != doc.all_term_names_in_file.end() ) { 
          int pos = it - doc.all_term_names_in_file.begin();
          doc_temp = doc.all_term_tf_in_file[pos] /  ( ( 1 - b ) + b * ( doc.doc_length / avg_doclen ) );
          if ( doc_temp > 0 ) {
            doc_temp = ( ( k1 + 1.0 ) * ( doc_temp + q ) ) / ( k1 + doc_temp + q );
          } // if
          else {
            doc_temp = 0;
          } // else

          doc_score = doc_score + doc_temp;
          doc_score = doc_score * ( ( k3 + 1.0 ) * 1.0 / ( k3 + 1.0 ) );
          auto iterator = doc_terms_map.find( query_file.term_names[i] );
          Dictionary doc_term = iterator -> second; 
          doc_score = doc_score * log10( ( MAX_NUM_DOCS - doc_term.term_property.num_of_article_including + 0.5 ) / ( doc_term.term_property.num_of_article_including + 0.5 ) );
        } // if

        doc_score = doc_score + doc_temp;
        doc_temp = 0.0;
      } // for()
      
      return doc_score;
    } // rank_doc_BM25()

    void output_ranking( ofstream & output_file ) {
      string query_name = query_file.file_name;
      query_name.erase( query_name.end() -4, query_name.end() );
      output_file << query_name << ",";
      for( int i = 0; i < all_docs.size(); i++ ) {
        string term_name = all_docs[i].doc_name;
        term_name.erase( term_name.end() -4, term_name.end() );
        output_file << term_name << " ";
      } // for

      output_file << "\n";
    } // output_ranking()

    vector<string> read_a_doc_to_dictionary( string file_name, float & doc_length ) {
      fstream file;
      string term_name;
      string path_name = "./docs/" + file_name;
      vector<string> term_names;
      file.open( path_name, ios::in );
      while ( file.peek() != EOF ) {
        file >> term_name;
        doc_length++;
        term_name = add_index_term_to_dictionary( term_name, file_name );
        if ( term_name != "" ) term_names.push_back( term_name );
      } // while

      file.close();
      return term_names;
    } // read_a_doc_to_dictionary()

    string add_index_term_to_dictionary( string term_name, string file_name ) {
      auto iterator = doc_terms_map.find( term_name );
      if ( iterator != doc_terms_map.end() ) {
        Dictionary doc_term = iterator -> second;
        doc_term.term_property.tf[num_file]++;
        if ( doc_term.term_in_last_file_name != file_name ) {
          doc_term.term_property.num_of_article_including++;
          doc_term.term_in_last_file_name = file_name;
        } // if
        else {
          term_name = "";
        } // else

        iterator -> second = doc_term;
      } // if
      else {
        TermProperty term_property;
        term_property.tf[num_file] = 1;
        term_property.num_of_article_including = 1;
        Dictionary doc_term;
        doc_term.term_name = term_name;
        doc_term.term_in_last_file_name = file_name;
        doc_term.term_property = term_property;
        doc_terms_map.insert( make_pair( term_name, doc_term ));
      } // else

      return term_name;
    } // add_index_term_to_dictionary()


}; // Doc_Scanner

int main() {
  cout << "Start scanning documents.\n";
  Doc_Scanner doc_scanner;
  doc_scanner.ReadAllDocuments();
  doc_scanner.CalculateDocsTermWeight();
  cout << "Scanning documents done.\n";
  cout << "Start Calulate Query Weight\n";
  // PrintVector( doc_terms );
  doc_scanner.CalculateEveryQueryFile();
  cout << "End Calulate Query Weight\n";
  ofstream output_file( "output_csv.csv" );
  doc_scanner.RankEveryQueryFile( output_file );
  cout << "End Program\n";
} // main()

HW6 - BERT Model

BERT 參數調法

BERT 理論心得

BERT(雙向Transformer編碼表達)
主要是以兩種預訓練的方式來建立語言模型。

讓model分辨句⼦之間上下⽂之關係

簡而知BERT預訓練模型需要做兩個主要任務:

第⼀Task: 判斷兩句話是否真的相鄰(Binary classification)
第⼆Task: 預測被mask之單詞(Multi-class classification)

(三) BERT實作心得

使用助教提供的baseline code下去實現，其主要呼叫的地方可以查官方API實作出來，利用此model回傳loss及logits值。

而TO-DO地方中需要回傳BM25的2D array scores，可以利用助教的方法實現

Code

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528


#!/usr/bin/env python
# coding: utf-8

# In[ ]:


from time import time
from datetime import timedelta
from copy import deepcopy

import random
import numpy as np
import pandas as pd
from ml_metrics import mapk

import torch
from torch.optim import AdamW
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForMultipleChoice

# Random seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# CUDA device
use_cuda_device = 0
torch.cuda.set_device(use_cuda_device)
print("Using CUDA device: %d" % torch.cuda.current_device())


# ## Settings

# In[ ]:


# Input files
document_csv_path = '../input/ntust-ir2020-homework6/documents.csv'
training_csv_path = '../input/ntust-ir2020-homework6/train_queries.csv'
testing_csv_path = '../input/ntust-ir2020-homework6/test_queries.csv'

# Input limitation
max_query_length = 64
max_input_length = 512
num_negatives = 3   # num. of negative documents to pair with a positive document

# Model finetuning
model_name_or_path = "bert-base-uncased"
max_epochs = 1
learning_rate = 3e-5
dev_set_ratio = 0.2   # make a ratio of training set as development set for rescoring weight sniffing
max_patience = 0      # earlystop if avg. loss on development set doesn't decrease for num. of epochs
batch_size = 2    # num. of inputs = 8 requires ~9200 MB VRAM (num. of inputs = batch_size * (num_negatives + 1))
num_workers = 2   # num. of jobs for pytorch dataloader

# Save paths
save_model_path = "models/bert_base_uncased"  # assign `None` for not saving the model
save_submission_path = "bm25_bert_rescoring.csv"
K = 1000   # for MAP@K


# ## Preparing

# In[ ]:


# Build and save BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name_or_path)
if save_model_path is not None:
    save_tokenizer_path = "%s/tokenizer" % (save_model_path)
    tokenizer.save_pretrained(save_tokenizer_path)

# Collect mapping of all document id and text
doc_id_to_text = {}
doc_df = pd.read_csv(document_csv_path)
doc_df.fillna("<Empty Document>", inplace=True)
id_text_pair = zip(doc_df["doc_id"], doc_df["doc_text"])
for i, pair in enumerate(id_text_pair, start=1):
    doc_id, doc_text = pair
    doc_id_to_text[doc_id] = doc_text
    
    print("Progress: %d/%d\r" % (i, len(doc_df)), end='')
    
doc_df.tail()


# # Training

# ## Split a ratio of training set as development set

# In[ ]:


train_df = pd.read_csv(training_csv_path)
dev_df, train_df = np.split(train_df, [int(dev_set_ratio*len(train_df))])
dev_df.reset_index(drop=True, inplace=True)
train_df.reset_index(drop=True, inplace=True)

print("train_df shape:", train_df.shape)
print("dev_df shape:", dev_df.shape)
train_df.tail()


# ## Build instances for training/development set

# In[ ]:


get_ipython().run_cell_magic('time', '', 'doc_id_to_token_ids = {}\n\n\ndef preprocess_df(df):\n    \'\'\' Preprocess DataFrame into training instances for BERT. \'\'\'\n    instances = []\n    \n    # Parse CSV\n    for i, row in df.iterrows():\n        query_id, query_text, pos_doc_ids, bm25_top1000, _ = row\n        pos_doc_id_list = pos_doc_ids.split()\n        pos_doc_id_set = set(pos_doc_id_list)\n        bm25_top1000_list = bm25_top1000.split()\n        bm25_top1000_set = set(bm25_top1000_list)\n\n        # Pair BM25 neg. with pos. samples\n        labeled_pos_neg_list = []\n        for pos_doc_id in pos_doc_id_list:\n            neg_doc_id_set = bm25_top1000_set - pos_doc_id_set\n            neg_doc_ids = random.sample(neg_doc_id_set, num_negatives)\n            pos_position = random.randint(0, num_negatives)\n            pos_neg_doc_ids = neg_doc_ids\n            pos_neg_doc_ids.insert(pos_position, pos_doc_id)\n            labeled_sample = (pos_neg_doc_ids, pos_position)\n            labeled_pos_neg_list.append(labeled_sample)\n            \n        # Make query tokens for BERT\n        query_tokens = tokenizer.tokenize(query_text)\n        if len(query_tokens) > max_query_length:  # truncation\n            query_tokens = query_tokens[:max_query_length]\n        query_token_ids = tokenizer.convert_tokens_to_ids(query_tokens)\n        query_token_ids.insert(0, tokenizer.cls_token_id)\n        query_token_ids.append(tokenizer.sep_token_id)\n\n        # Make input instances for all query/doc pairs\n        for doc_ids, label in labeled_pos_neg_list:\n            paired_input_ids = []\n            paired_attention_mask = []\n            paired_token_type_ids = []\n            \n            # Merge all pos/neg inputs as a single sample\n            for doc_id in doc_ids:\n                if doc_id in doc_id_to_token_ids:\n                    doc_token_ids = doc_id_to_token_ids[doc_id]\n                else:\n                    doc_text = doc_id_to_text[doc_id]\n                    doc_tokens = tokenizer.tokenize(doc_text)\n                    doc_token_ids = tokenizer.convert_tokens_to_ids(doc_tokens)\n                    doc_id_to_token_ids[doc_id] = doc_token_ids\n                doc_token_ids.append(tokenizer.sep_token_id)\n\n                # make input sequences for BERT\n                input_ids = query_token_ids + doc_token_ids\n                token_type_ids = [0 for token_id in query_token_ids]\n                token_type_ids.extend(1 for token_id in doc_token_ids)\n                if len(input_ids) > max_input_length:  # truncation\n                    input_ids = input_ids[:max_input_length]\n                    token_type_ids = token_type_ids[:max_input_length]\n                attention_mask = [1 for token_id in input_ids]\n                \n                # convert and collect inputs as tensors\n                input_ids = torch.LongTensor(input_ids)\n                attention_mask = torch.FloatTensor(attention_mask)\n                token_type_ids = torch.LongTensor(token_type_ids)\n                paired_input_ids.append(input_ids)\n                paired_attention_mask.append(attention_mask)\n                paired_token_type_ids.append(token_type_ids)\n            label = torch.LongTensor([label]).squeeze()\n            \n            # Pre-pad tensor pairs for efficiency\n            paired_input_ids = pad_sequence(paired_input_ids, batch_first=True)\n            paired_attention_mask = pad_sequence(paired_attention_mask, batch_first=True)\n            paired_token_type_ids = pad_sequence(paired_token_type_ids, batch_first=True)\n\n            # collect all inputs as a dictionary\n            instance = {}\n            instance[\'input_ids\'] = paired_input_ids.T  # transpose for code efficiency\n            instance[\'attention_mask\'] = paired_attention_mask.T\n            instance[\'token_type_ids\'] = paired_token_type_ids.T\n            instance[\'label\'] = label\n            instances.append(instance)\n\n        print("Progress: %d/%d\\r" % (i+1, len(df)), end=\'\')\n    print()\n    return instances\n\ntrain_instances = preprocess_df(train_df)\ndev_instances = preprocess_df(dev_df)\n\nprint("num. train_instances: %d" % len(train_instances))\nprint("num. dev_instances: %d" % len(dev_instances))\nprint("input_ids.T shape:", train_instances[0][\'input_ids\'].T.shape)\ntrain_instances[0][\'input_ids\'].T\n')


# ## Build dataset and dataloader for PyTorch

# In[ ]:


class TrainingDataset(Dataset):
    def __init__(self, instances):
        self.instances = instances
    
    def __len__(self):
        return len(self.instances)
        
    def __getitem__(self, i):
        instance = self.instances[i]
        input_ids = instance['input_ids']
        attention_mask = instance['attention_mask']
        token_type_ids = instance['token_type_ids']
        label = instance['label']
        return input_ids, attention_mask, token_type_ids, label
    
def get_train_dataloader(instances, batch_size=2, num_workers=4):
    def collate_fn(batch):
        input_ids, attention_mask, token_type_ids, labels = zip(*batch)
        input_ids = pad_sequence(input_ids, batch_first=True).transpose(1,2).contiguous()  # re-transpose
        attention_mask = pad_sequence(attention_mask, batch_first=True).transpose(1,2).contiguous()
        token_type_ids = pad_sequence(token_type_ids, batch_first=True).transpose(1,2).contiguous()
        labels = torch.stack(labels)
        return input_ids, attention_mask, token_type_ids, labels
    
    dataset = TrainingDataset(instances)
    dataloader = DataLoader(dataset, collate_fn=collate_fn, shuffle=True, \
                            batch_size=batch_size, num_workers=num_workers)
    return dataloader

# Demo
dataloader = get_train_dataloader(train_instances)
for batch in dataloader:
    input_ids, attention_mask, token_type_ids, labels = batch
    break
    
print(input_ids.shape)
input_ids


# ## Initialize and finetune BERT

# In[ ]:


model = BertForMultipleChoice.from_pretrained(model_name_or_path)
model.cuda()

optimizer = AdamW(model.parameters(), lr=learning_rate)
optimizer.zero_grad()


# ### (TO-DO!) Define validation function for earlystopping

# In[ ]:


def validate(model, instances):
    total_loss = 0
    model.eval()
    dataloader = get_train_dataloader(instances, batch_size=batch_size, num_workers=num_workers)
    for batch in dataloader:
        batch = (tensor.cuda() for tensor in batch)
        input_ids, attention_mask, token_type_ids, labels = batch
        
        ''' TO-DO: 
        1. Compute the cross-entropy loss (using built-in loss of BertForMultipleChoice)
          (Hint: You need to call a function of model which takes all the 4 tensors in the batch as inputs)
          
        2. Sum up the loss of all dev-set samples
          (Hint: The built-in loss is averaged, so you should multiply it with the batch size)
        '''
        with torch.no_grad():
            ### 1. insert_missing_code 
            loss = model( input_ids = input_ids, attention_mask = attention_mask,     
                          token_type_ids = token_type_ids, labels = labels, return_dict=1 ).loss          
        ### 2. insert_missing_code 
        total_loss += loss * batch_size 
        
    avg_loss = total_loss / len(instances)
    return avg_loss


# ### (TO-DO!) Let's train this beeg boy ;-)

# In[ ]:


patience, best_dev_loss = 0, 1e10
best_state_dict = model.state_dict()

start_time = time()
dataloader = get_train_dataloader(train_instances, batch_size=batch_size, num_workers=num_workers)
for epoch in range(1, max_epochs+1):
    model.train()
    for i, batch in enumerate(dataloader, start=1):
        batch = (tensor.cuda() for tensor in batch)
        input_ids, attention_mask, token_type_ids, labels = batch
        
        # Backpropogation
        ''' TO-DO: 
        1. Compute the cross-entropy loss (using built-in loss of BertForMultipleChoice)
          (Hint: You need to call a function of model which takes all the 4 tensors in the batch as inputs)
         
        2. Perform backpropogation on the loss (i.e. compute gradients)
        3. Optimize the model.
          (Hint: These two lines of codes can be found in PyTorch tutorial)
        '''
        ### 1. insert_missing_code
        loss = model( input_ids = input_ids, attention_mask = attention_mask,     
                      token_type_ids = token_type_ids, labels = labels, return_dict=1 ).loss  

        ### 2. insert_missing_code
        loss.backward()
        ### 3. insert_missing_code
        optimizer.step()

        optimizer.zero_grad()
        
        # Progress bar with timer ;-)
        elapsed_time = time() - start_time
        elapsed_time = timedelta(seconds=int(elapsed_time))
        print("Epoch: %d/%d | Batch: %d/%d | loss=%.5f | %s      \r" \
              % (epoch, max_epochs, i, len(dataloader), loss, elapsed_time), end='')
        
    # Save parameters of each epoch
    if save_model_path is not None:
        save_checkpoint_path = "%s/epoch_%d" % (save_model_path, epoch)
        model.save_pretrained(save_checkpoint_path)
        
    # Get avg. loss on development set
    print("Epoch: %d/%d | Validating...                           \r" % (epoch, max_epochs), end='')
    dev_loss = validate(model, dev_instances)
    elapsed_time = time() - start_time
    elapsed_time = timedelta(seconds=int(elapsed_time))
    print("Epoch: %d/%d | dev_loss=%.5f | %s                      " \
          % (epoch, max_epochs, dev_loss, elapsed_time))
    
    # Track best checkpoint and earlystop patience
    if dev_loss < best_dev_loss:
        patience = 0
        best_dev_loss = dev_loss
        best_state_dict = deepcopy(model.state_dict())
        if save_model_path is not None:
            model.save_pretrained(save_model_path)
    else:
        patience += 1
    
    if patience > max_patience:
        print('Earlystop at epoch %d' % epoch)
        break
        
# Restore parameters with best loss on development set
model.load_state_dict(best_state_dict)


# # Testing

# In[ ]:


class TestingDataset(Dataset):
    def __init__(self, instances):
        self.instances = instances
    
    def __len__(self):
        return len(self.instances)
        
    def __getitem__(self, i):
        instance = self.instances[i]
        input_ids = instance['input_ids']
        attention_mask = instance['attention_mask']
        token_type_ids = instance['token_type_ids']
        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.FloatTensor(attention_mask)
        token_type_ids = torch.LongTensor(token_type_ids)
        return input_ids, attention_mask, token_type_ids, 
    
def get_test_dataloader(instances, batch_size=8, num_workers=4):
    def collate_fn(batch):
        input_ids, attention_mask, token_type_ids = zip(*batch)
        input_ids = pad_sequence(input_ids, batch_first=True).unsqueeze(1)  # predict as single choice
        attention_mask = pad_sequence(attention_mask, batch_first=True).unsqueeze(1)
        token_type_ids = pad_sequence(token_type_ids, batch_first=True).unsqueeze(1)
        return input_ids, attention_mask, token_type_ids
    
    dataset = TestingDataset(instances)
    dataloader = DataLoader(dataset, collate_fn=collate_fn, shuffle=False, \
                            batch_size=batch_size, num_workers=num_workers)
    return dataloader


# ## (TO-DO!) Define function to predict BERT scores

# In[ ]:


def predict_query_doc_scores(model, df):
    model.eval()
    start_time = time()

    # Parse CSV
    query_id_list = df["query_id"]
    query_text_list = df["query_text"]
    bm25_top1000_list = df["bm25_top1000"]

    # Treat {1 query, K documents} as a dataset for prediction
    query_doc_scores = []
    query_doc_ids = []
    rows = zip(query_id_list, query_text_list, bm25_top1000_list)
    for qi, row in enumerate(rows, start=1):
        query_id, query_text, bm25_top1000 = row
        bm25_doc_id_list = bm25_top1000.split()
        query_doc_ids.append(bm25_doc_id_list)

        #################################################
        #    Collect all instances of query/doc pairs
        #################################################
        query_instances = []

        # Make query tokens for BERT
        query_tokens = tokenizer.tokenize(query_text)
        if len(query_tokens) > max_query_length:  # truncation
            query_tokens = query_tokens[:max_query_length]
        query_token_ids = tokenizer.convert_tokens_to_ids(query_tokens)
        query_token_ids.insert(0, tokenizer.cls_token_id)
        query_token_ids.append(tokenizer.sep_token_id)

        # Make input instances for all query/doc pairs
        for i, doc_id in enumerate(bm25_doc_id_list, start=1):
            if doc_id in doc_id_to_token_ids:
                doc_token_ids = doc_id_to_token_ids[doc_id]
            else:
                doc_text = doc_id_to_text[doc_id]
                doc_tokens = tokenizer.tokenize(doc_text)
                doc_token_ids = tokenizer.convert_tokens_to_ids(doc_tokens)
                doc_id_to_token_ids[doc_id] = doc_token_ids
            doc_token_ids.append(tokenizer.sep_token_id)

            # make input sequences for BERT
            input_ids = query_token_ids + doc_token_ids
            token_type_ids = [0 for token_id in query_token_ids]
            token_type_ids.extend(1 for token_id in doc_token_ids)
            if len(input_ids) > max_input_length:  # truncation
                input_ids = input_ids[:max_input_length]
                token_type_ids = token_type_ids[:max_input_length]
            attention_mask = [1 for token_id in input_ids]

            # convert and collect inputs as tensors
            input_ids = torch.LongTensor(input_ids)
            attention_mask = torch.FloatTensor(attention_mask)
            token_type_ids = torch.LongTensor(token_type_ids)


            # collect all inputs as a dictionary
            instance = {}
            instance['input_ids'] = input_ids
            instance['attention_mask'] = attention_mask
            instance['token_type_ids'] = token_type_ids
            query_instances.append(instance)

        #################################################################
        #    Predict relevance scores for all BM25-top-1000 documents
        #################################################################
        doc_scores = np.empty((0,1))

        # Predict scores for each document
        dataloader = get_test_dataloader(query_instances, batch_size=batch_size*(num_negatives+1), num_workers=num_workers)
        for di, batch in enumerate(dataloader, start=1):
            batch = (tensor.cuda() for tensor in batch)
            input_ids, attention_mask, token_type_ids = batch
            
            ''' TO-DO: 
            1. Compute the logits as relevance scores (using the same function of how you compute built-in loss)
              (Hint: You need to call a function of model which takes all the 3 tensors in the batch as inputs)
         
            2. The scores are still on GPU. Reallocate them on CPU, and convert into numpy arrays.
              (Hint: You need to call two functions on the `scores` tensors. You can find them in PyTorch tutorial.)
            '''
            with torch.no_grad():
                ### 1. insert_missing_code_to_compute_logits ###
                scores = model( input_ids=input_ids, attention_mask=attention_mask,     
                                token_type_ids=token_type_ids, return_dict=True ).logits  


            # merge all scores into a big numpy array
            ### step 2.  insert_missing_function_1()###.###insert_missing_function_2()
            scores = scores.cpu().numpy() 
            doc_scores = np.vstack((doc_scores, scores)) ## 新增new row

            # Progress bar with timer ;-)
            elapsed_time = time() - start_time
            elapsed_time = timedelta(seconds=int(elapsed_time))
            print("Query: %d/%d | Progress: %d/%d | %s      \r" \
                  % (qi, len(df), di, len(dataloader), elapsed_time), end='')

        # merge all query/BM25 document pair scores
        query_doc_scores.append(doc_scores)
    query_doc_scores = np.hstack(query_doc_scores).T

    print()
    return query_doc_scores, query_doc_ids


# In[ ]:





# ## (TO-DO!) Find best weight of BERT for BM25 rescoring on training set

# In[ ]:


dev_query_doc_scores, dev_query_doc_ids = predict_query_doc_scores(model, dev_df)

print('---- Grid search weight for "BM25 + weight * BERT" ----')
best_map_score, best_bert_weight = -100, 0.0
bert_scores = dev_query_doc_scores
n_query = dev_query_doc_scores.shape[0]

# Get MAP@K of BM25 baseline
query_pos_doc_ids = dev_df['pos_doc_ids'].values.tolist()
actual = [doc_ids.split() for doc_ids in query_pos_doc_ids]
bm25_predicted = [doc_id_list[:K] for doc_id_list in dev_query_doc_ids]
map_score = mapk(actual, bm25_predicted, k=K)
best_map_score = map_score
print("weight=%.1f: %.5f  (BM25 baseline)" % (0, 100*map_score))

# Collect BM25 scores into same format of BERT scores
''' TO-DO: 
1. Convert the BM25 top-1000 scores into 2d numpy arrays
2. BM25 scores should have the same shape and orders as `dev_query_doc_scores` (i.e. BERT scores)
  (Hint: If there are 24 dev-set queries, the shape should be (24, 1000) )
'''

### 2. insert_whatever_you_want_to_meet_the_requirement_in_step2.
bm25_scores = [scores.split() for scores in dev_df["bm25_top1000_scores"]] 
bm25_scores = [[float(score) for score in scores] for scores in bm25_scores]
bm25_scores = np.array(bm25_scores)


# Grid search for BM25 + BERT rescoring
low_bound, high_bound, scale = 0, 5, 1000
grids = [i / scale for i in range(low_bound * scale+1, high_bound * scale+1)]
for weight in grids:
    
    ''' TO-DO: 
    1. Compute the weighted scores using `bm25_scores`, `weight`, and `bert_scores`
    '''
    weighted_scores = bm25_scores + weight * bert_scores ### 1. insert_missing_code ###
    
    # sort index and map to document ids as output
    rescore_argsort = np.flip(weighted_scores.argsort(), axis=1)
    predicted = []
    for i in range(n_query):  # num. of queries
        predicted.append([dev_query_doc_ids[i][idx] for idx in rescore_argsort[i]][:K])
    map_score = mapk(actual, predicted, k=K)
    
    # show part of results for human evaluation
    if weight * 10 % 2 == 0:
        print("weight=%.1f: %.5f" % (weight, 100*map_score))
        
    # track weight with best MAP@10
    if map_score > best_map_score:
        best_map_score = map_score
        best_bert_weight = weight
print("\nHighest MAP@%d = %.5f found at weight=%.3f" % (K, 100*best_map_score, best_bert_weight))


# ## (TO-DO!) Rescore testing set with BERT for submission

# In[ ]:


# Predict BERT scores for testing set
test_df = pd.read_csv(testing_csv_path)
query_id_list = test_df["query_id"]
n_query = len(query_id_list)
test_query_doc_scores, test_query_doc_ids = predict_query_doc_scores(model, test_df)
bert_scores = test_query_doc_scores


# In[ ]:


# Rescore query/document score with BM25 + BERT
bm25_scores = [scores.split() for scores in test_df["bm25_top1000_scores"]]  # parse into 2d list of string
bm25_scores = [[float(score) for score in scores] for scores in bm25_scores]  # convert to float
bm25_scores = np.array(bm25_scores)

''' TO-DO: 
1. Compute the weighted scores using `bm25_scores`, `best_bert_weight`, and `bert_scores`
'''
### 1. insesrt_missing_code ###
weighted_scores = bm25_scores + best_bert_weight * bert_scores

# Rerank document ids with new scores
rescore_argsort = np.flip(weighted_scores.argsort(), axis=1)
ranked_doc_id_list = []
for i in range(n_query):  # num. of queries
    ranked_doc_id_list.append([test_query_doc_ids[i][idx] for idx in rescore_argsort[i]][:K])
ranked_doc_ids = [' '.join(doc_id_list) for doc_id_list in ranked_doc_id_list]

# Save reranked results for submission
data = {'query_id': query_id_list, 'ranked_doc_ids': ranked_doc_ids}
submission_df = pd.DataFrame(data)
submission_df.reset_index(drop=True, inplace=True)
submission_df.to_csv(save_submission_path, index=False)
print("Saved submission file as `%s`" % save_submission_path)

HW1 - Vector Space Model#

Code#

HW2 - Best Match Model#

Code#

HW3 - Mean Average Precision (MAP)#

Code#

HW4 - PLSA Model#

Code#

HW5 - Query Modeling (Rocchio Algorithm)#

Code#

HW6 - BERT Model#

Code#

HW1 - Vector Space Model

Code

HW2 - Best Match Model

Code

HW3 - Mean Average Precision (MAP)

Code

HW4 - PLSA Model

Code

HW5 - Query Modeling (Rocchio Algorithm)

Code

HW6 - BERT Model

Code