mirror of
				https://github.com/ArthurIdema/Zoekeend-Phrase-Indexing.git
				synced 2025-10-26 16:24:21 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			133 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			133 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pathlib
 | |
| import os
 | |
| 
 | |
| import ir_datasets
 | |
| 
 | |
| 
 | |
class ir_dataset_test:
    """Small in-memory test collection.

    Offers the same accessor names this module uses on ir_datasets
    objects (`docs_count`, `docs_iter`, `queries_iter`, `qrels_iter`),
    backed by hard-coded documents, queries and relevance judgments.
    """

    class Doc:
        """A document: an identifier plus its text."""

        def __init__(self, doc_id, text):
            self.doc_id, self.text = doc_id, text

    class Query:
        """A query: an identifier plus its text."""

        def __init__(self, query_id, text):
            self.query_id, self.text = query_id, text

    class Qrel:
        """A relevance judgment for one (query, document) pair."""

        def __init__(self, query_id, doc_id, relevance):
            self.query_id, self.doc_id = query_id, doc_id
            self.relevance = relevance

    # Custom documents
    doc1 = Doc('d1', 'Custom document one about information retrieval.')
    doc2 = Doc('d2', 'Custom document two about machine learning.')
    doc3 = Doc('d3', 'Custom document three about artificial intelligence.')
    doc4 = Doc('d4', 'Custom-document FOUR about INFORMATION-RETRIEVAL and its applications.')
    doc5 = Doc('d5', 'Another custom document, artificial intelligence with punctuation! And special characters like @#$%.')
    doc6 = Doc('d6', 'Machine-learning is artificial amazing; it combines AI, data-science, and more.')
    doc7 = Doc('d7', 'Information retrieval is the backbone of search engines and academic research.')
    doc8 = Doc('d8', 'Machine learning has become a core part of artificial intelligence.')
    doc9 = Doc('d9', 'Artificial intelligence artificial kip saté and machine learning are fields with significant overlap.')
    doc10 = Doc('d10', 'Machine learning is a subfield of artificial intelligence focused on data.')
    doc11 = Doc('d11', 'The process of information retrieval includes indexing and ranking documents.')
    doc12 = Doc('d12', 'Many AI systems rely on both machine learning and information retrieval.')
    doc13 = Doc('d13', 'Artificial intelligence kip saté is widely used in natural language processing and robotics.')
    doc14 = Doc('d14', 'Information retrieval systems are essential for finding relevant documents.')
    doc15 = Doc('d15', 'Machine learning algorithms adapt based on data patterns.')
    doc16 = Doc('d16', 'Artificial intelligence kip saté applications range from games to healthcare.')
    doc17 = Doc('d17', 'Information retrieval helps systems return relevant search results.')
    doc18 = Doc('d18', 'Machine learning and artificial intelligence are driving modern technology.')
    doc19 = Doc('d19', 'Artificial intelligence is often combined with information retrieval to build smart assistants.')
    doc20 = Doc('d20', 'The in the over at on Advanced machine learning techniques artificial intelligence are part of the artificial intelligence stack.')

    docs = [
        doc1, doc2, doc3, doc4, doc5,
        doc6, doc7, doc8, doc9, doc10,
        doc11, doc12, doc13, doc14, doc15,
        doc16, doc17, doc18, doc19, doc20,
    ]

    # Custom queries
    query1 = Query('1', 'information retrieval')
    query2 = Query('2', 'machine learning')
    query3 = Query('3', 'artificial intelligence')
    queries = [query1, query2, query3]

    # Custom relevance judgments
    qrel1 = Qrel('1', 'd1', 2)
    qrel2 = Qrel('2', 'd2', 1)
    qrel3 = Qrel('3', 'd3', 1)
    qrels = [qrel1, qrel2, qrel3]

    def docs_count(self):
        """Return the number of documents in the collection."""
        collection = self.docs
        return len(collection)

    def docs_iter(self):
        """Return the list of `Doc` objects (the shared class-level list)."""
        collection = self.docs
        return collection

    def queries_iter(self):
        """Return the list of `Query` objects (the shared class-level list)."""
        topics = self.queries
        return topics

    def qrels_iter(self):
        """Return the list of `Qrel` objects (the shared class-level list)."""
        judgments = self.qrels
        return judgments
 | |
| 
 | |
| 
 | |
def file_exists(name_in):
    """Return True if `name_in` is a path to an existing file, else False."""
    candidate = pathlib.Path(name_in)
    return candidate.is_file()
 | |
| 
 | |
| 
 | |
def _write_qrels(qrels, qrel_file):
    """Write `qrels` to `qrel_file`, one `query_id Q0 doc_id relevance` line each.

    The lines are first written to a temporary sibling file which is then
    renamed over `qrel_file`, so a failure halfway through never leaves a
    truncated file behind that later calls would mistake for a complete one.
    """
    tmp_file = qrel_file + '.tmp'
    try:
        with open(tmp_file, 'w', encoding='utf-8') as file:
            file.writelines(
                f"{q.query_id} Q0 {q.doc_id} {q.relevance}\n" for q in qrels
            )
        os.replace(tmp_file, qrel_file)
    except BaseException:
        # Do not leave the partial temporary file lying around.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        raise


def get_qrels(experiment):
    """Return the name of a qrels file for `experiment`, creating it if needed.

    `experiment` is interpreted as, in order:

    * the literal string "custom": the built-in test collection, stored in
      "custom.qrels";
    * the path of an existing file: returned unchanged, as a ready-made
      qrels file;
    * anything else: an ir_datasets identifier, stored in
      "<experiment>.qrels" with every '/' replaced by '_'.

    A qrels file that already exists in the working directory is reused and
    not rewritten.
    """
    if experiment == "custom":
        from ze_eval import ir_dataset_test
        qrel_file = "custom.qrels"
        if not pathlib.Path(qrel_file).is_file():
            _write_qrels(ir_dataset_test().qrels_iter(), qrel_file)
        return qrel_file

    if pathlib.Path(experiment).is_file():  # provide a qrels file directly...
        return experiment

    # ... or an ir_dataset
    qrel_file = (experiment + '.qrels').replace('/', '_')
    if not pathlib.Path(qrel_file).is_file():
        # Only touch ir_datasets when the cached file is actually missing.
        ir_dataset = ir_datasets.load(experiment)
        _write_qrels(ir_dataset.qrels_iter(), qrel_file)
    return qrel_file
 | |
| 
 | |
def _first_postings_costs(run_name):
    """Return {query_id: cost} read from the run file `run_name`.

    Only lines with at least seven whitespace-separated fields are used:
    field 1 is the query id and field 7 is parsed as a float cost. For each
    query only the first line carrying a parsable cost counts. An unreadable
    file yields an empty dict, because reporting costs is best-effort.
    """
    costs = {}
    try:
        with open(run_name, 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 7:
                    continue
                try:
                    cost = float(parts[6])
                except ValueError:
                    continue  # seventh field is not a number
                costs.setdefault(parts[0], cost)
    except (OSError, UnicodeDecodeError):
        return {}
    return costs


def trec_eval(run_name, experiment, complete_rel=False,
        ndcg=False, query_eval=False):
    """Run the external `trec_eval` program on a run file and print cost stats.

    Args:
        run_name: path of the run file to evaluate.
        experiment: passed to `get_qrels` to obtain the qrels file.
        complete_rel: when true, pass the `-c` switch to trec_eval.
        ndcg: when true, additionally pass `-m ndcg_cut`.
        query_eval: when true, pass the `-q` switch to trec_eval.

    The command line is printed before it is executed. Afterwards, if the
    run file carries a numeric seventh column, the average and the total of
    the per-query postings cost (first line of each query) are printed.
    """
    import subprocess
    import sys

    qrel_file = get_qrels(experiment)
    switches = ['-m', 'official']
    if ndcg:
        switches += ['-m', 'ndcg_cut']
    if complete_rel:
        switches.append('-c')
    if query_eval:
        switches.append('-q')
    command = f"trec_eval {' '.join(switches)} {qrel_file} {run_name}"
    # Flush so the echoed command appears before the child's own output.
    print(command, flush=True)
    try:
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters intact. A non-zero exit status is ignored.
        subprocess.run(['trec_eval', *switches, str(qrel_file), str(run_name)])
    except OSError as err:
        # e.g. trec_eval is not installed; keep going like a failed shell call.
        print(f"Could not run trec_eval: {err}", file=sys.stderr)

    # After running trec_eval, compute and print average postings cost if
    # available in run file
    postings_costs = _first_postings_costs(run_name)
    if postings_costs:
        total_cost = sum(postings_costs.values())
        avg_cost = total_cost / len(postings_costs)
        print(f"Average cost in postings: {avg_cost:.4f}")
        print(f"Total postings cost: {total_cost:.4f}")
 | 
