merge context gathering methods

2026-01-20 22:34:36 -06:00
parent 25053109e2
commit 4e567ce4e2
2 changed files with 46 additions and 61 deletions
@@ -526,54 +526,25 @@ class SXNGPlugin(Plugin):
                'publishedDate': r.get('publishedDate', '')
            })
        # SearXNG already merges infoboxes by ID - take first with full content
        infoboxes = []
-        for ib in raw_infoboxes[:2]:
+        for ib in raw_infoboxes[:1]:
            infoboxes.append({
                'name': ib.get('infobox', '') or ib.get('title', ''),
-                'content': ib.get('content', '')[:400],
+                'content': ib.get('content', '')[:2000],
-                'attributes': ib.get('attributes', [])[:3]
+                'attributes': ib.get('attributes', [])
            })
-        answers = [a.get('answer', '') for a in raw_answers[:2] 
+        # Only extract simple Answer types (skip Translations, WeatherAnswer etc.)
-                   if a.get('answer') and not str(a.get('answer')).startswith('<')]
+        answers = []
        for a in list(raw_answers)[:2]:
            if hasattr(a, 'answer') and isinstance(getattr(a, 'answer', None), str):
                answers.append(a.answer)
            elif isinstance(a, dict) and a.get('answer'):
                answers.append(str(a['answer']))
        return results, infoboxes, answers
    def _format_aux_context_string(self, results, infoboxes, answers, offset):
        sections = []
        aux_urls = []
        kg_lines = []
        for ib in infoboxes:
            if ib.get('name'):
                content = ib.get('content', '')
                kg_lines.append(f"INFOBOX [{ib['name']}]: {content}")
        for a in answers:
            if a:
                kg_lines.append(f"ANSWER: {a}")
        if kg_lines:
            sections.append('KNOWLEDGE GRAPH:\n' + '\n'.join(kg_lines))
        source_lines = []
        for i, r in enumerate(results):
            url = r.get('url', '')
            aux_urls.append(url)
            # Match JS logic: domain extraction
            domain = urlparse(url).netloc.replace('www.', '')
            date = f" ({r['publishedDate']})" if r.get('publishedDate') else ''
            title = r.get('title', '')
            content = r.get('content', '')[:600]
            # [index] domain(date): title: content
            idx = i + 1 + offset
            source_lines.append(f"[{idx}] {domain}{date}: {title}: {content}")
        if source_lines:
            sections.append('SOURCES:\n' + '\n'.join(source_lines))
        return "\n\n".join(sections), aux_urls
    def init(self, app):
@@ -625,7 +596,7 @@ class SXNGPlugin(Plugin):
                results, infoboxes, answers = self._parse_aux_results(raw_results, raw_infoboxes, raw_answers)
-                context_str, new_urls = self._format_aux_context_string(results, infoboxes, answers, offset)
+                context_str, new_urls = self._assemble_context(results, infoboxes, answers, offset)
                return jsonify({
                    'context': context_str,
@@ -663,7 +634,7 @@ class SXNGPlugin(Plugin):
                        search_data.get('answers', [])
                    )
-                    context_str, new_urls = self._format_aux_context_string(results, infoboxes, answers, offset)
+                    context_str, new_urls = self._assemble_context(results, infoboxes, answers, offset)
                    return jsonify({
                        'context': context_str,
@@ -858,19 +829,22 @@ class SXNGPlugin(Plugin):
            })
        return True
-    def _assemble_context(self, search, raw_results) -> str:
+    def _assemble_context(self, raw_results, infoboxes, answers, offset=0) -> tuple[str, list]:
-        """Builds three-tier context string from search results."""
+        """Builds context string from normalized search data. Returns (context_str, urls)."""
        context_parts = []
        result_urls = []
        # Knowledge graph
        knowledge_graph_lines = []
-        for infobox in getattr(search.result_container, 'infoboxes', [])[:3]:
+        for ib in infoboxes:
-            ib_name = infobox.get('infobox', '') or infobox.get('title', '')
+            ib_name = ib.get('name', '') or ib.get('infobox', '') or ib.get('title', '')
-            ib_content = str(infobox.get('content', '')).replace('\n', ' ').strip()
+            ib_content = str(ib.get('content', '')).replace('\n', ' ').strip()
            if ib_name:
                parts = [f"INFOBOX [{ib_name}]:"]
                if ib_content:
-                    parts.append(ib_content[:600])
+                    parts.append(ib_content)
-                for attr in infobox.get('attributes', [])[:5]:
+                for attr in ib.get('attributes', []):
                    attr_label = attr.get('label', '')
                    attr_value = attr.get('value', '')
                    if attr_label and attr_value:
@@ -878,41 +852,46 @@ class SXNGPlugin(Plugin):
                knowledge_graph_lines.append(" ".join(parts) if len(parts) == 2 else "\n".join(parts))
-        for answer in getattr(search.result_container, 'answers', []):
+        for ans_text in answers:
-            if hasattr(answer, 'answer'):
+            if ans_text and not str(ans_text).startswith('<'):
-                ans_text = str(answer.answer).replace('\n', ' ').strip()[:300]
+                knowledge_graph_lines.append(f"ANSWER: {str(ans_text)[:300]}")
                ans_url = getattr(answer, 'url', '')
                if ans_text and not ans_text.startswith('<'):
                    knowledge_graph_lines.append(f"ANSWER: {ans_text}" + (f" [via {ans_url}]" if ans_url else ""))
        if knowledge_graph_lines:
            context_parts.append("KNOWLEDGE GRAPH:\n" + "\n".join(knowledge_graph_lines))
        # Deep sources: full content
        deep_lines = []
        for i, r in enumerate(raw_results[:self.context_deep_count]):
-            domain = urlparse(r.get('url', '')).netloc
+            url = r.get('url', '')
            result_urls.append(url)
            domain = urlparse(url).netloc.replace('www.', '')
            date = r.get('publishedDate')
            date_str = f" ({date})" if date else ""
            title = (r.get('title') or "").replace('\n', ' ').strip()
            content = str(r.get('content', '')).replace('\n', ' ').strip()[:800]
-            deep_lines.append(f"[{i+1}] {domain}{date_str}: {title}: {content}")
+            idx = i + 1 + offset
            deep_lines.append(f"[{idx}] {domain}{date_str}: {title}: {content}")
        if deep_lines:
            context_parts.append("DEEP SOURCES:\n" + "\n".join(deep_lines))
        # Shallow sources: headlines only
        if self.context_shallow_count > 0:
            shallow_lines = []
            start_idx = self.context_deep_count
            end_idx = self.context_deep_count + self.context_shallow_count
            for i, r in enumerate(raw_results[start_idx:end_idx]):
-                domain = urlparse(r.get('url', '')).netloc.replace('www.', '')
+                url = r.get('url', '')
                result_urls.append(url)
                domain = urlparse(url).netloc.replace('www.', '')
                title = (r.get('title') or '').replace('\n', ' ').strip()[:60]
-                shallow_lines.append(f"[{i+1+start_idx}] {domain}: {title}")
+                idx = i + 1 + start_idx + offset
                shallow_lines.append(f"[{idx}] {domain}: {title}")
            if shallow_lines:
                context_parts.append("SHALLOW SOURCES (headlines):\n" + "\n".join(shallow_lines))
-        return "\n\n".join(context_parts)
+        return "\n\n".join(context_parts), result_urls
    def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults:
        results = EngineResults()
@@ -927,7 +906,12 @@ class SXNGPlugin(Plugin):
                return results
            raw_results = search.result_container.get_ordered_results()
-            context_str = self._assemble_context(search, raw_results)
+            raw_infoboxes = getattr(search.result_container, 'infoboxes', [])
            raw_answers = getattr(search.result_container, 'answers', [])
            # Normalize for unified context assembly
            _, infoboxes, answers = self._parse_aux_results(raw_results, raw_infoboxes, raw_answers)
            context_str, _ = self._assemble_context(raw_results, infoboxes, answers)
            ts = str(int(time.time()))
@@ -259,6 +259,7 @@ def run_tests():
            class MockResultContainer:
                def __init__(self):
                    self.answers = set()
                    self.infoboxes = []
                def get_ordered_results(self):
                    return [
                        {"title": "T1", "content": "C1", "url": "https://a.com/1", "publishedDate": "2026-01-15"},