Result filtering by relevance and RAG with chunking logic

This commit is contained in:
Tyler
2026-05-17 15:27:21 -04:00
parent 9d6d4ec160
commit 2ed6a0aae9
+86 -32
View File
@@ -1,4 +1,5 @@
import json, os, logging, base64, time, hashlib, re, http.client, ssl, concurrent.futures, threading import json, os, logging, base64, time, hashlib, re, http.client, ssl, concurrent.futures, threading, math
from collections import Counter
from urllib.parse import urlparse from urllib.parse import urlparse
from searx import network from searx import network
try: try:
@@ -55,6 +56,44 @@ def _get_streaming_connection(url: str, verify_ssl: bool = True):
return conn, path return conn, path
def _tokenize(text: str) -> list:
    """Split *text* into lowercase word tokens for relevance scoring.

    Every character that is neither a word character nor whitespace is
    replaced by a space, the result is split on whitespace, and tokens of
    one or two characters are discarded.

    Args:
        text: Raw text to tokenize.

    Returns:
        The surviving tokens, in their original order.
    """
    normalized = re.sub(r'[^\w\s]', ' ', text.lower())
    return [word for word in normalized.split() if len(word) >= 3]
def _tfidf_score(query_tokens: list, doc_tokens: list) -> float:
    """Score how well *doc_tokens* matches *query_tokens*.

    This is a single-document heuristic loosely shaped like BM25: for each
    query token, a length-normalised, saturating term-frequency factor is
    multiplied by a log-damped weight and the products are summed. The
    weight called "idf" here is derived from the token's frequency in this
    same document, not from a corpus, so it only dampens very frequent
    tokens. Repeated query tokens contribute once per occurrence.

    Args:
        query_tokens: Tokens of the search query.
        doc_tokens: Tokens of the document (or chunk) being scored.

    Returns:
        A non-negative score; ``0.0`` when either list is empty or no query
        token occurs in the document.
    """
    if not (query_tokens and doc_tokens):
        return 0.0

    k1, b, avg_len = 1.5, 0.75, 150
    doc_len = len(doc_tokens)
    term_counts = Counter(doc_tokens)
    # Length normalisation depends only on the document, so compute it once.
    length_norm = k1 * (1 - b + b * doc_len / avg_len)

    total = 0.0
    for term in query_tokens:
        tf = term_counts[term] / doc_len
        # doc_len is at least 1 here, so the in-document weight is 1 / (1 + tf).
        idf = 1.0 / (1.0 + tf)
        saturated_tf = (tf * (k1 + 1)) / (tf + length_norm)
        total += saturated_tf * math.log(1 + idf)
    return total
def _chunk_text(text: str, chunk_size: int = 512, overlap: int = 64) -> list:
    """Split *text* into overlapping chunks of at most *chunk_size* tokens.

    Tokens come from ``_tokenize``. When the text has no more than
    *chunk_size* tokens it is returned unchanged as the only chunk. Longer
    text is returned as space-joined token windows, so those chunks are
    lowercased and lack punctuation and the short tokens that ``_tokenize``
    drops.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be at least 1.
        overlap: Number of tokens shared between consecutive chunks.

    Returns:
        A list with at least one chunk string.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    tokens = _tokenize(text)
    total = len(tokens)
    if total <= chunk_size:
        return [text]

    # An overlap >= chunk_size would keep the window in place (or move it
    # backwards) and never terminate; always advance by at least one token.
    step = max(1, chunk_size - overlap)

    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(' '.join(tokens[start:end]))
        if end >= total:
            # This window already reaches the last token; later windows
            # would only repeat a suffix of it.
            break
    return chunks
_VALKEY_POOL = None _VALKEY_POOL = None
@@ -1378,7 +1417,7 @@ class SXNGPlugin(Plugin):
if res.status != 200: if res.status != 200:
return '' return ''
html = res.read(512 * 1024).decode('utf-8', errors='replace') html = res.read(256 * 1024).decode('utf-8', errors='replace')
finally: finally:
conn.close() conn.close()
@@ -1393,55 +1432,70 @@ class SXNGPlugin(Plugin):
text = re.sub(r'\s+', ' ', text).strip() text = re.sub(r'\s+', ' ', text).strip()
logger.debug(f"{PLUGIN_NAME}: fetched {len(text)} chars from {url}") logger.debug(f"{PLUGIN_NAME}: fetched {len(text)} chars from {url}")
return text[:2000] return text[:6000]
return '' return ''
except Exception: except Exception:
return '' return ''
def _enrich_results(self, clean_results: list, query: str) -> list:
    """Fetch page text for the leading results, score relevance, and rerank.

    Every result dict gets ``fetched_content`` (default ``''``) and
    ``relevance_score`` (default ``0.0``). The first
    ``min(5, self.context_deep_count + 2)`` results are fetched concurrently
    through ``self._fetch_page_text``. When a page yields at least 100
    characters of text, its best-scoring chunk (per ``_tfidf_score``) is
    stored, truncated to 800 characters, together with that score. When the
    fetch yields less, times out, or raises, only the result's ``content``
    snippet is scored.

    Args:
        clean_results: Result dicts; they are mutated in place.
        query: The user's search query.

    Returns:
        A new list containing every input result exactly once: leading
        results with a positive score (highest first), then the results that
        were not fetched, then leading results whose score stayed at zero.
    """
    query_tokens = _tokenize(query)
    enrich_count = min(5, self.context_deep_count + 2)

    for r in clean_results:
        r['fetched_content'] = ''
        r['relevance_score'] = 0.0

    candidates = clean_results[:enrich_count]
    futures_map: dict = {}
    # NOTE: result(timeout=4) bounds each individual wait, but leaving the
    # with-block still waits for every submitted fetch to finish.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        for r in candidates:
            futures_map[executor.submit(self._fetch_page_text, r.get('url', ''))] = r

        for future, r in futures_map.items():
            url = r.get('url', '')
            try:
                text = future.result(timeout=4)
            except Exception as e:
                # A timed-out or failed fetch is treated like an empty page
                # so the result can still be ranked by its snippet.
                logger.debug(f"{PLUGIN_NAME}: enrich failed for {url}: {e}")
                text = ''

            try:
                if not text or len(text) < 100:
                    snippet = r.get('content', '')
                    if snippet:
                        r['relevance_score'] = _tfidf_score(query_tokens, _tokenize(snippet))
                    continue

                chunks = _chunk_text(text, chunk_size=512, overlap=64)
                best_chunk = ''
                best_score = -1.0
                for chunk in chunks:
                    score = _tfidf_score(query_tokens, _tokenize(chunk))
                    if score > best_score:
                        best_score = score
                        best_chunk = chunk

                r['fetched_content'] = best_chunk[:800]
                r['relevance_score'] = best_score
                logger.debug(
                    f"{PLUGIN_NAME}: [{url}] "
                    f"score={best_score:.4f} chunks={len(chunks)}"
                )
            except Exception as e:
                # Scoring is best-effort; the result keeps whatever score it has.
                logger.debug(f"{PLUGIN_NAME}: enrich failed for {url}: {e}")

    # Partition by the result objects themselves rather than by URL, so
    # results with a missing or duplicated URL are never dropped.
    enriched = []
    zero_scored = []
    for r in candidates:
        if r.get('relevance_score', 0) > 0:
            enriched.append(r)
        else:
            zero_scored.append(r)
    enriched.sort(key=lambda r: r['relevance_score'], reverse=True)

    reranked = enriched + clean_results[enrich_count:] + zero_scored

    if enriched:
        logger.debug(
            f"{PLUGIN_NAME}: reranked {len(enriched)} results, "
            f"top score={enriched[0]['relevance_score']:.4f}"
        )

    return reranked
def _assemble_context(self, clean_results, infoboxes, answers, offset=0) -> tuple[str, list]: def _assemble_context(self, clean_results, infoboxes, answers, offset=0) -> tuple[str, list]:
"""Builds context string from normalized search data. Returns (context_str, urls).""" """Builds context string from normalized search data. Returns (context_str, urls)."""