Result filtering by relevance and RAG with chunking logic

2026-05-17 15:27:21 -04:00
parent 9d6d4ec160
commit 2ed6a0aae9
1 changed files with 86 additions and 32 deletions
@@ -1,4 +1,5 @@
-import json, os, logging, base64, time, hashlib, re, http.client, ssl, concurrent.futures, threading
+import json, os, logging, base64, time, hashlib, re, http.client, ssl, concurrent.futures, threading, math
 from collections import Counter
 from urllib.parse import urlparse
 from searx import network
 try:
@@ -55,6 +56,44 @@ def _get_streaming_connection(url: str, verify_ssl: bool = True):
    return conn, path
 def _tokenize(text: str) -> list:
    """Return the lowercase word tokens of *text* that are longer than two characters.

    Every character that is neither a word character nor whitespace is
    replaced by a space first, so punctuation acts as a token separator.
    """
    normalized = re.sub(r'[^\w\s]', ' ', text.lower())
    # Tokens of one or two characters are dropped.
    return [word for word in normalized.split() if len(word) > 2]
 def _tfidf_score(query_tokens: list, doc_tokens: list, *, k1: float = 1.5,
                  b: float = 0.75, avg_len: float = 150) -> float:
    """Score how well *doc_tokens* matches *query_tokens* (BM25-style heuristic).

    Despite the name, no corpus statistics are used: the "idf" factor is
    derived from the term's relative frequency inside the document itself,
    so it only dampens terms that dominate the document.

    Args:
        query_tokens: Tokens of the query. Repeated tokens contribute once
            per occurrence.
        doc_tokens: Tokens of the document or chunk being scored.
        k1: Term-frequency saturation constant.
        b: Strength of the document-length normalisation.
        avg_len: Assumed average document length in tokens; must be non-zero.

    Returns:
        A non-negative score with the default constants; ``0.0`` when either
        token list is empty or no query token occurs in the document.
    """
    if not doc_tokens or not query_tokens:
        return 0.0
    doc_len = len(doc_tokens)
    term_counts = Counter(doc_tokens)
    # The length normalisation is identical for every query term: compute once.
    length_norm = k1 * (1 - b + b * doc_len / avg_len)
    score = 0.0
    for term in query_tokens:
        count = term_counts.get(term, 0)
        if not count:
            # An absent term has tf == 0 and would contribute exactly 0.0.
            continue
        tf = count / doc_len
        damping = 1.0 / (1.0 + tf)
        saturated_tf = (tf * (k1 + 1)) / (tf + length_norm)
        score += saturated_tf * math.log(1 + damping)
    return score
 def _chunk_text(text: str, chunk_size: int = 512, overlap: int = 64) -> list:
    """Split *text* into overlapping chunks of at most *chunk_size* tokens.

    The text is tokenized with ``_tokenize``. If it has no more than
    *chunk_size* tokens, the original text is returned unchanged as the only
    chunk. Otherwise each chunk is the space-joined tokens of one window, so
    chunks contain the tokenizer's output rather than the raw text.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be at least 1.
        overlap: Number of tokens shared by consecutive chunks.

    Returns:
        List of chunk strings.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    tokens = _tokenize(text)
    total = len(tokens)
    if total <= chunk_size:
        return [text]
    # With overlap >= chunk_size the window would never advance and the loop
    # would not terminate; always move forward by at least one token.
    step = max(1, chunk_size - overlap)
    chunks = []
    for start in range(0, total, step):
        chunks.append(' '.join(tokens[start:start + chunk_size]))
        if start + chunk_size >= total:
            # This window already reached the last token.
            break
    return chunks
 _VALKEY_POOL = None
@@ -1378,7 +1417,7 @@ class SXNGPlugin(Plugin):
                    if res.status != 200:
                        return ''
-                    html = res.read(512 * 1024).decode('utf-8', errors='replace')
+                    html = res.read(256 * 1024).decode('utf-8', errors='replace')
                finally:
                    conn.close()
@@ -1393,55 +1432,70 @@ class SXNGPlugin(Plugin):
                text = re.sub(r'\s+', ' ', text).strip()
                logger.debug(f"{PLUGIN_NAME}: fetched {len(text)} chars from {url}")
-                return text[:2000]
+                return text[:6000]
            return ''
        except Exception:
            return ''
    def _enrich_results(self, clean_results: list, query: str) -> list:
-        enrich_count = min(3, self.context_deep_count)
+        query_tokens = _tokenize(query)
        enrich_count = min(5, self.context_deep_count + 2)
        for r in clean_results:
            r['fetched_content'] = ''
            r['relevance_score'] = 0.0
        futures_map: dict = {}
-        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            for r in clean_results[:enrich_count]:
                futures_map[executor.submit(self._fetch_page_text, r.get('url', ''))] = r
            for future, r in futures_map.items():
                try:
-                    text = future.result(timeout=6)
+                    text = future.result(timeout=4)
-                    if text and len(text) > 100:
+                    if not text or len(text) < 100:
-                        words = query.lower().split()
+                        snippet = r.get('content', '')
-                        text_lower = text.lower()
+                        if snippet:
-                        best_pos = len(text) // 2
+                            r['relevance_score'] = _tfidf_score(query_tokens, _tokenize(snippet))
-                        best_count = -1
+                        continue
-                        keyword_positions = []
+                    chunks = _chunk_text(text, chunk_size=512, overlap=64)
-                        for word in words:
+                    best_chunk = ''
-                            start = 0
+                    best_score = -1.0
-                            while True:
+                    for chunk in chunks:
-                                idx = text_lower.find(word, start)
+                        score = _tfidf_score(query_tokens, _tokenize(chunk))
-                                if idx == -1:
+                        if score > best_score:
-                                    break
+                            best_score = score
-                                keyword_positions.append(idx)
+                            best_chunk = chunk
                                start = idx + 1
-                        for pos in (keyword_positions or [best_pos]):
+                    r['fetched_content'] = best_chunk[:800]
-                            window_start = max(0, pos - 400)
+                    r['relevance_score'] = best_score
-                            window_end = min(len(text), pos + 400)
+                    logger.debug(
-                            count = sum(w in text_lower[window_start:window_end] for w in words)
+                        f"{PLUGIN_NAME}: [{r.get('url', '')}] "
-                            if count > best_count:
+                        f"score={best_score:.4f} chunks={len(chunks)}"
-                                best_count = count
+                    )
-                                best_pos = pos
+                except Exception as e:
                    logger.debug(f"{PLUGIN_NAME}: enrich failed for {r.get('url', '')}: {e}")
-                        start = max(0, best_pos - 400)
+        enriched = [r for r in clean_results[:enrich_count] if r.get('relevance_score', 0) > 0]
-                        r['fetched_content'] = text[start:start + 800]
+        not_enriched = clean_results[enrich_count:]
-                except Exception:
+        enriched.sort(key=lambda r: r['relevance_score'], reverse=True)
-                    pass
+        reranked = enriched + not_enriched
-        return clean_results
+        seen_urls = {r.get('url') for r in reranked}
        for r in clean_results:
            if r.get('url') not in seen_urls:
                reranked.append(r)
                seen_urls.add(r.get('url'))
        if enriched:
            logger.debug(
                f"{PLUGIN_NAME}: reranked {len(enriched)} results, "
                f"top score={enriched[0]['relevance_score']:.4f}"
            )
        return reranked
    def _assemble_context(self, clean_results, infoboxes, answers, offset=0) -> tuple[str, list]:
        """Builds context string from normalized search data. Returns (context_str, urls)."""