merge context gathering methods

This commit is contained in:
cra88y/pc
2026-01-20 22:34:36 -06:00
parent 25053109e2
commit 4e567ce4e2
2 changed files with 46 additions and 61 deletions
+45 -61
View File
@@ -526,54 +526,25 @@ class SXNGPlugin(Plugin):
'publishedDate': r.get('publishedDate', '') 'publishedDate': r.get('publishedDate', '')
}) })
# SearXNG already merges infoboxes by ID - take first with full content
infoboxes = [] infoboxes = []
for ib in raw_infoboxes[:2]: for ib in raw_infoboxes[:1]:
infoboxes.append({ infoboxes.append({
'name': ib.get('infobox', '') or ib.get('title', ''), 'name': ib.get('infobox', '') or ib.get('title', ''),
'content': ib.get('content', '')[:400], 'content': ib.get('content', '')[:2000],
'attributes': ib.get('attributes', [])[:3] 'attributes': ib.get('attributes', [])
}) })
answers = [a.get('answer', '') for a in raw_answers[:2] # Only extract simple Answer types (skip Translations, WeatherAnswer etc.)
if a.get('answer') and not str(a.get('answer')).startswith('<')] answers = []
for a in list(raw_answers)[:2]:
if hasattr(a, 'answer') and isinstance(getattr(a, 'answer', None), str):
answers.append(a.answer)
elif isinstance(a, dict) and a.get('answer'):
answers.append(str(a['answer']))
return results, infoboxes, answers return results, infoboxes, answers
def _format_aux_context_string(self, results, infoboxes, answers, offset):
sections = []
aux_urls = []
kg_lines = []
for ib in infoboxes:
if ib.get('name'):
content = ib.get('content', '')
kg_lines.append(f"INFOBOX [{ib['name']}]: {content}")
for a in answers:
if a:
kg_lines.append(f"ANSWER: {a}")
if kg_lines:
sections.append('KNOWLEDGE GRAPH:\n' + '\n'.join(kg_lines))
source_lines = []
for i, r in enumerate(results):
url = r.get('url', '')
aux_urls.append(url)
# Match JS logic: domain extraction
domain = urlparse(url).netloc.replace('www.', '')
date = f" ({r['publishedDate']})" if r.get('publishedDate') else ''
title = r.get('title', '')
content = r.get('content', '')[:600]
# [index] domain(date): title: content
idx = i + 1 + offset
source_lines.append(f"[{idx}] {domain}{date}: {title}: {content}")
if source_lines:
sections.append('SOURCES:\n' + '\n'.join(source_lines))
return "\n\n".join(sections), aux_urls
def init(self, app): def init(self, app):
@@ -625,7 +596,7 @@ class SXNGPlugin(Plugin):
results, infoboxes, answers = self._parse_aux_results(raw_results, raw_infoboxes, raw_answers) results, infoboxes, answers = self._parse_aux_results(raw_results, raw_infoboxes, raw_answers)
context_str, new_urls = self._format_aux_context_string(results, infoboxes, answers, offset) context_str, new_urls = self._assemble_context(results, infoboxes, answers, offset)
return jsonify({ return jsonify({
'context': context_str, 'context': context_str,
@@ -663,7 +634,7 @@ class SXNGPlugin(Plugin):
search_data.get('answers', []) search_data.get('answers', [])
) )
context_str, new_urls = self._format_aux_context_string(results, infoboxes, answers, offset) context_str, new_urls = self._assemble_context(results, infoboxes, answers, offset)
return jsonify({ return jsonify({
'context': context_str, 'context': context_str,
@@ -858,19 +829,22 @@ class SXNGPlugin(Plugin):
}) })
return True return True
def _assemble_context(self, search, raw_results) -> str: def _assemble_context(self, raw_results, infoboxes, answers, offset=0) -> tuple[str, list]:
"""Builds three-tier context string from search results.""" """Builds context string from normalized search data. Returns (context_str, urls)."""
context_parts = [] context_parts = []
result_urls = []
# Knowledge graph
knowledge_graph_lines = [] knowledge_graph_lines = []
for infobox in getattr(search.result_container, 'infoboxes', [])[:3]: for ib in infoboxes:
ib_name = infobox.get('infobox', '') or infobox.get('title', '') ib_name = ib.get('name', '') or ib.get('infobox', '') or ib.get('title', '')
ib_content = str(infobox.get('content', '')).replace('\n', ' ').strip() ib_content = str(ib.get('content', '')).replace('\n', ' ').strip()
if ib_name: if ib_name:
parts = [f"INFOBOX [{ib_name}]:"] parts = [f"INFOBOX [{ib_name}]:"]
if ib_content: if ib_content:
parts.append(ib_content[:600]) parts.append(ib_content)
for attr in infobox.get('attributes', [])[:5]: for attr in ib.get('attributes', []):
attr_label = attr.get('label', '') attr_label = attr.get('label', '')
attr_value = attr.get('value', '') attr_value = attr.get('value', '')
if attr_label and attr_value: if attr_label and attr_value:
@@ -878,41 +852,46 @@ class SXNGPlugin(Plugin):
knowledge_graph_lines.append(" ".join(parts) if len(parts) == 2 else "\n".join(parts)) knowledge_graph_lines.append(" ".join(parts) if len(parts) == 2 else "\n".join(parts))
for answer in getattr(search.result_container, 'answers', []): for ans_text in answers:
if hasattr(answer, 'answer'): if ans_text and not str(ans_text).startswith('<'):
ans_text = str(answer.answer).replace('\n', ' ').strip()[:300] knowledge_graph_lines.append(f"ANSWER: {str(ans_text)[:300]}")
ans_url = getattr(answer, 'url', '')
if ans_text and not ans_text.startswith('<'):
knowledge_graph_lines.append(f"ANSWER: {ans_text}" + (f" [via {ans_url}]" if ans_url else ""))
if knowledge_graph_lines: if knowledge_graph_lines:
context_parts.append("KNOWLEDGE GRAPH:\n" + "\n".join(knowledge_graph_lines)) context_parts.append("KNOWLEDGE GRAPH:\n" + "\n".join(knowledge_graph_lines))
# Deep sources: full content
deep_lines = [] deep_lines = []
for i, r in enumerate(raw_results[:self.context_deep_count]): for i, r in enumerate(raw_results[:self.context_deep_count]):
domain = urlparse(r.get('url', '')).netloc url = r.get('url', '')
result_urls.append(url)
domain = urlparse(url).netloc.replace('www.', '')
date = r.get('publishedDate') date = r.get('publishedDate')
date_str = f" ({date})" if date else "" date_str = f" ({date})" if date else ""
title = (r.get('title') or "").replace('\n', ' ').strip() title = (r.get('title') or "").replace('\n', ' ').strip()
content = str(r.get('content', '')).replace('\n', ' ').strip()[:800] content = str(r.get('content', '')).replace('\n', ' ').strip()[:800]
deep_lines.append(f"[{i+1}] {domain}{date_str}: {title}: {content}") idx = i + 1 + offset
deep_lines.append(f"[{idx}] {domain}{date_str}: {title}: {content}")
if deep_lines: if deep_lines:
context_parts.append("DEEP SOURCES:\n" + "\n".join(deep_lines)) context_parts.append("DEEP SOURCES:\n" + "\n".join(deep_lines))
# Shallow sources: headlines only
if self.context_shallow_count > 0: if self.context_shallow_count > 0:
shallow_lines = [] shallow_lines = []
start_idx = self.context_deep_count start_idx = self.context_deep_count
end_idx = self.context_deep_count + self.context_shallow_count end_idx = self.context_deep_count + self.context_shallow_count
for i, r in enumerate(raw_results[start_idx:end_idx]): for i, r in enumerate(raw_results[start_idx:end_idx]):
domain = urlparse(r.get('url', '')).netloc.replace('www.', '') url = r.get('url', '')
result_urls.append(url)
domain = urlparse(url).netloc.replace('www.', '')
title = (r.get('title') or '').replace('\n', ' ').strip()[:60] title = (r.get('title') or '').replace('\n', ' ').strip()[:60]
shallow_lines.append(f"[{i+1+start_idx}] {domain}: {title}") idx = i + 1 + start_idx + offset
shallow_lines.append(f"[{idx}] {domain}: {title}")
if shallow_lines: if shallow_lines:
context_parts.append("SHALLOW SOURCES (headlines):\n" + "\n".join(shallow_lines)) context_parts.append("SHALLOW SOURCES (headlines):\n" + "\n".join(shallow_lines))
return "\n\n".join(context_parts) return "\n\n".join(context_parts), result_urls
def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults: def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults:
results = EngineResults() results = EngineResults()
@@ -927,7 +906,12 @@ class SXNGPlugin(Plugin):
return results return results
raw_results = search.result_container.get_ordered_results() raw_results = search.result_container.get_ordered_results()
context_str = self._assemble_context(search, raw_results) raw_infoboxes = getattr(search.result_container, 'infoboxes', [])
raw_answers = getattr(search.result_container, 'answers', [])
# Normalize for unified context assembly
_, infoboxes, answers = self._parse_aux_results(raw_results, raw_infoboxes, raw_answers)
context_str, _ = self._assemble_context(raw_results, infoboxes, answers)
ts = str(int(time.time())) ts = str(int(time.time()))
+1
View File
@@ -259,6 +259,7 @@ def run_tests():
class MockResultContainer: class MockResultContainer:
def __init__(self): def __init__(self):
self.answers = set() self.answers = set()
self.infoboxes = []
def get_ordered_results(self): def get_ordered_results(self):
return [ return [
{"title": "T1", "content": "C1", "url": "https://a.com/1", "publishedDate": "2026-01-15"}, {"title": "T1", "content": "C1", "url": "https://a.com/1", "publishedDate": "2026-01-15"},