Solution officielle
import re
import math
from collections import Counter, defaultdict
# Runs of ASCII letters, digits and Latin-1 accented letters. The accented
# range is split so that U+00D7 (×) and U+00F7 (÷), which sit inside À-ÿ but
# are math operators rather than letters, act as separators.
_MOTIF_MOT = re.compile(r'[a-zA-Z0-9À-ÖØ-öø-ÿ]+')


def tokeniser(texte):
    """Split text into lowercase word tokens of at least two characters.

    The text is lowercased, then every maximal run of ASCII letters, digits
    and Latin-1 accented letters becomes a token. Any other character
    (whitespace, punctuation, ``×``, ``÷``, ...) acts as a separator.
    Single-character tokens are discarded.

    Args:
        texte: The string to tokenise.

    Returns:
        The tokens, in order of appearance (duplicates are kept).
    """
    return [mot for mot in _MOTIF_MOT.findall(texte.lower()) if len(mot) > 1]
def calculer_tf(document):
    """Compute the relative term frequency of every token in a document.

    Args:
        document: The raw text of the document.

    Returns:
        A dict mapping each token to its occurrence count divided by the
        total number of tokens in the document. An empty dict is returned
        when the document yields no token.
    """
    jetons = tokeniser(document)
    nb_jetons = len(jetons)
    if nb_jetons == 0:
        return {}

    frequences = {}
    for terme, occurrences in Counter(jetons).items():
        frequences[terme] = occurrences / nb_jetons
    return frequences
def calculer_idf(documents):
    """Compute the inverse document frequency of every term in a corpus.

    Args:
        documents: A sized collection of raw document texts.

    Returns:
        A dict mapping each term to ``log(N / df)``, where ``N`` is the
        number of documents and ``df`` the number of documents containing
        the term. A term present in every document therefore gets ``0.0``.
        An empty dict is returned for an empty corpus.
    """
    nb_documents = len(documents)
    if not nb_documents:
        return {}

    # set() ensures a term is counted at most once per document.
    presence = Counter(
        terme
        for texte in documents
        for terme in set(tokeniser(texte))
    )

    idf = {}
    for terme, nb_porteurs in presence.items():
        idf[terme] = math.log(nb_documents / nb_porteurs)
    return idf
def scorer_document(document, requête, idf):
    """Score a document against a query with TF-IDF weights.

    Args:
        document: The raw text of the document.
        requête: The raw text of the query.
        idf: A mapping from term to inverse document frequency; terms
            missing from it are weighted 0.

    Returns:
        The sum, over every query token, of the document's TF-IDF weight
        for that token. A token repeated in the query is counted each time;
        a token absent from the document contributes 0.
    """
    poids = {}
    for terme, frequence in calculer_tf(document).items():
        poids[terme] = frequence * idf.get(terme, 0)

    termes_requete = tokeniser(requête)
    return sum(poids.get(terme, 0) for terme in termes_requete)
def classer_resultats(documents, requête):
    """Rank the documents of a corpus by relevance to a query.

    Args:
        documents: A sized collection of raw document texts.
        requête: The raw text of the query.

    Returns:
        A list of ``(index, score)`` tuples, one per document whose score
        is strictly positive, sorted by decreasing score. Documents with
        equal scores keep their original relative order.
    """
    idf = calculer_idf(documents)

    notes = [
        (position, scorer_document(texte, requête, idf))
        for position, texte in enumerate(documents)
    ]
    pertinents = [paire for paire in notes if paire[1] > 0]

    # list.sort is stable, including with reverse=True, so ties stay in
    # document order.
    pertinents.sort(key=lambda paire: paire[1], reverse=True)
    return pertinents