import pandas as pd

# Dataset source: https://zenodo.org/record/2783642
q_df = pd.read_csv("https://zenodo.org/record/2783642/files/Questions.csv")
a_df = pd.read_csv("https://zenodo.org/record/2783642/files/Answers.csv")
print("q_df.shape:", q_df.shape)
print("a_df.shape:", a_df.shape)

# Strip surrounding whitespace from the header names before any lookup by name.
q_df.columns = list(map(str.strip, q_df.columns))
a_df.columns = list(map(str.strip, a_df.columns))

# Join questions to answers on the shared AID key; the resulting columns are
# relabelled purely by position, so this relies on the merged column order.
df = pd.merge(q_df, a_df, on="AID")
df.columns = ["query", "AID", "document"]

# Parallel views of the answers table: one {"AID": ...} record and one text per row.
metadata = a_df[["AID"]].to_dict(orient="records")
documents = a_df["Text"].tolist()

# (question text, AID) pairs, in question-table row order.
query_list = list(zip(q_df["Text"], q_df["AID"]))

# NOTE(review): `display` is not defined in this file; presumably it is
# supplied by the notebook runtime -- confirm before running as a plain script.
display(q_df.head(3))
display(a_df.head(3))
def weighted_reciprocal_rank_fusion(rank_lists, weights, k=-1):
    """Fuse several ranked lists with weighted Reciprocal Rank Fusion (RRF).

    Each item at 1-based position ``rank`` in a list contributes
    ``weight * 1 / (rank + k)`` to that item's score; contributions are
    summed across all lists.

    Args:
        rank_lists (list of lists): Ranked lists of hashable items, best
            first. Items are expected to be unique within a list.
        weights (list of float): One weight per rank list, in the same order.
        k (float, optional): Constant added to every rank. Larger values
            flatten the difference between high and low positions. The
            default ``-1`` is a sentinel meaning "choose automatically":
            ``k`` becomes half the length of the first rank list.

    Returns:
        list: All items seen in any list, ordered by descending fused score.
        Items with equal scores keep their first-seen order. An empty
        ``rank_lists`` yields an empty list.

    Raises:
        ValueError: If ``rank_lists`` and ``weights`` differ in length.
    """
    # Validate before touching rank_lists[0], so a length mismatch is always
    # reported as ValueError rather than surfacing as an IndexError.
    if len(rank_lists) != len(weights):
        raise ValueError("Number of rank lists must be equal to the number of weights.")

    # Nothing to fuse (also keeps the automatic k below from indexing an empty list).
    if not rank_lists:
        return []

    if k == -1:
        k = 0.5 * len(rank_lists[0])

    rrf_scores = {}
    for rank_list, weight in zip(rank_lists, weights):
        for rank, item in enumerate(rank_list, start=1):
            rrf_scores[item] = rrf_scores.get(item, 0.0) + weight * (1 / (rank + k))

    # sorted() is stable, so ties retain insertion (first-seen) order.
    return sorted(rrf_scores, key=rrf_scores.get, reverse=True)