merge context gathering methods
This commit is contained in:
+45
-61
@@ -526,54 +526,25 @@ class SXNGPlugin(Plugin):
|
|||||||
'publishedDate': r.get('publishedDate', '')
|
'publishedDate': r.get('publishedDate', '')
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# SearXNG already merges infoboxes by ID - take first with full content
|
||||||
infoboxes = []
|
infoboxes = []
|
||||||
for ib in raw_infoboxes[:2]:
|
for ib in raw_infoboxes[:1]:
|
||||||
infoboxes.append({
|
infoboxes.append({
|
||||||
'name': ib.get('infobox', '') or ib.get('title', ''),
|
'name': ib.get('infobox', '') or ib.get('title', ''),
|
||||||
'content': ib.get('content', '')[:400],
|
'content': ib.get('content', '')[:2000],
|
||||||
'attributes': ib.get('attributes', [])[:3]
|
'attributes': ib.get('attributes', [])
|
||||||
})
|
})
|
||||||
|
|
||||||
answers = [a.get('answer', '') for a in raw_answers[:2]
|
# Only extract simple Answer types (skip Translations, WeatherAnswer etc.)
|
||||||
if a.get('answer') and not str(a.get('answer')).startswith('<')]
|
answers = []
|
||||||
|
for a in list(raw_answers)[:2]:
|
||||||
|
if hasattr(a, 'answer') and isinstance(getattr(a, 'answer', None), str):
|
||||||
|
answers.append(a.answer)
|
||||||
|
elif isinstance(a, dict) and a.get('answer'):
|
||||||
|
answers.append(str(a['answer']))
|
||||||
|
|
||||||
return results, infoboxes, answers
|
return results, infoboxes, answers
|
||||||
|
|
||||||
def _format_aux_context_string(self, results, infoboxes, answers, offset):
|
|
||||||
sections = []
|
|
||||||
aux_urls = []
|
|
||||||
|
|
||||||
kg_lines = []
|
|
||||||
for ib in infoboxes:
|
|
||||||
if ib.get('name'):
|
|
||||||
content = ib.get('content', '')
|
|
||||||
kg_lines.append(f"INFOBOX [{ib['name']}]: {content}")
|
|
||||||
|
|
||||||
for a in answers:
|
|
||||||
if a:
|
|
||||||
kg_lines.append(f"ANSWER: {a}")
|
|
||||||
|
|
||||||
if kg_lines:
|
|
||||||
sections.append('KNOWLEDGE GRAPH:\n' + '\n'.join(kg_lines))
|
|
||||||
|
|
||||||
source_lines = []
|
|
||||||
for i, r in enumerate(results):
|
|
||||||
url = r.get('url', '')
|
|
||||||
aux_urls.append(url)
|
|
||||||
# Match JS logic: domain extraction
|
|
||||||
domain = urlparse(url).netloc.replace('www.', '')
|
|
||||||
date = f" ({r['publishedDate']})" if r.get('publishedDate') else ''
|
|
||||||
title = r.get('title', '')
|
|
||||||
content = r.get('content', '')[:600]
|
|
||||||
# [index] domain(date): title: content
|
|
||||||
idx = i + 1 + offset
|
|
||||||
source_lines.append(f"[{idx}] {domain}{date}: {title}: {content}")
|
|
||||||
|
|
||||||
if source_lines:
|
|
||||||
sections.append('SOURCES:\n' + '\n'.join(source_lines))
|
|
||||||
|
|
||||||
return "\n\n".join(sections), aux_urls
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def init(self, app):
|
def init(self, app):
|
||||||
@@ -625,7 +596,7 @@ class SXNGPlugin(Plugin):
|
|||||||
|
|
||||||
results, infoboxes, answers = self._parse_aux_results(raw_results, raw_infoboxes, raw_answers)
|
results, infoboxes, answers = self._parse_aux_results(raw_results, raw_infoboxes, raw_answers)
|
||||||
|
|
||||||
context_str, new_urls = self._format_aux_context_string(results, infoboxes, answers, offset)
|
context_str, new_urls = self._assemble_context(results, infoboxes, answers, offset)
|
||||||
|
|
||||||
return jsonify({
|
return jsonify({
|
||||||
'context': context_str,
|
'context': context_str,
|
||||||
@@ -663,7 +634,7 @@ class SXNGPlugin(Plugin):
|
|||||||
search_data.get('answers', [])
|
search_data.get('answers', [])
|
||||||
)
|
)
|
||||||
|
|
||||||
context_str, new_urls = self._format_aux_context_string(results, infoboxes, answers, offset)
|
context_str, new_urls = self._assemble_context(results, infoboxes, answers, offset)
|
||||||
|
|
||||||
return jsonify({
|
return jsonify({
|
||||||
'context': context_str,
|
'context': context_str,
|
||||||
@@ -858,19 +829,22 @@ class SXNGPlugin(Plugin):
|
|||||||
})
|
})
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _assemble_context(self, search, raw_results) -> str:
|
def _assemble_context(self, raw_results, infoboxes, answers, offset=0) -> tuple[str, list]:
|
||||||
"""Builds three-tier context string from search results."""
|
"""Builds context string from normalized search data. Returns (context_str, urls)."""
|
||||||
context_parts = []
|
context_parts = []
|
||||||
|
result_urls = []
|
||||||
|
|
||||||
|
# Knowledge graph
|
||||||
knowledge_graph_lines = []
|
knowledge_graph_lines = []
|
||||||
for infobox in getattr(search.result_container, 'infoboxes', [])[:3]:
|
for ib in infoboxes:
|
||||||
ib_name = infobox.get('infobox', '') or infobox.get('title', '')
|
ib_name = ib.get('name', '') or ib.get('infobox', '') or ib.get('title', '')
|
||||||
ib_content = str(infobox.get('content', '')).replace('\n', ' ').strip()
|
ib_content = str(ib.get('content', '')).replace('\n', ' ').strip()
|
||||||
|
|
||||||
if ib_name:
|
if ib_name:
|
||||||
parts = [f"INFOBOX [{ib_name}]:"]
|
parts = [f"INFOBOX [{ib_name}]:"]
|
||||||
if ib_content:
|
if ib_content:
|
||||||
parts.append(ib_content[:600])
|
parts.append(ib_content)
|
||||||
for attr in infobox.get('attributes', [])[:5]:
|
for attr in ib.get('attributes', []):
|
||||||
attr_label = attr.get('label', '')
|
attr_label = attr.get('label', '')
|
||||||
attr_value = attr.get('value', '')
|
attr_value = attr.get('value', '')
|
||||||
if attr_label and attr_value:
|
if attr_label and attr_value:
|
||||||
@@ -878,41 +852,46 @@ class SXNGPlugin(Plugin):
|
|||||||
|
|
||||||
knowledge_graph_lines.append(" ".join(parts) if len(parts) == 2 else "\n".join(parts))
|
knowledge_graph_lines.append(" ".join(parts) if len(parts) == 2 else "\n".join(parts))
|
||||||
|
|
||||||
for answer in getattr(search.result_container, 'answers', []):
|
for ans_text in answers:
|
||||||
if hasattr(answer, 'answer'):
|
if ans_text and not str(ans_text).startswith('<'):
|
||||||
ans_text = str(answer.answer).replace('\n', ' ').strip()[:300]
|
knowledge_graph_lines.append(f"ANSWER: {str(ans_text)[:300]}")
|
||||||
ans_url = getattr(answer, 'url', '')
|
|
||||||
if ans_text and not ans_text.startswith('<'):
|
|
||||||
knowledge_graph_lines.append(f"ANSWER: {ans_text}" + (f" [via {ans_url}]" if ans_url else ""))
|
|
||||||
|
|
||||||
if knowledge_graph_lines:
|
if knowledge_graph_lines:
|
||||||
context_parts.append("KNOWLEDGE GRAPH:\n" + "\n".join(knowledge_graph_lines))
|
context_parts.append("KNOWLEDGE GRAPH:\n" + "\n".join(knowledge_graph_lines))
|
||||||
|
|
||||||
|
# Deep sources: full content
|
||||||
deep_lines = []
|
deep_lines = []
|
||||||
for i, r in enumerate(raw_results[:self.context_deep_count]):
|
for i, r in enumerate(raw_results[:self.context_deep_count]):
|
||||||
domain = urlparse(r.get('url', '')).netloc
|
url = r.get('url', '')
|
||||||
|
result_urls.append(url)
|
||||||
|
domain = urlparse(url).netloc.replace('www.', '')
|
||||||
date = r.get('publishedDate')
|
date = r.get('publishedDate')
|
||||||
date_str = f" ({date})" if date else ""
|
date_str = f" ({date})" if date else ""
|
||||||
title = (r.get('title') or "").replace('\n', ' ').strip()
|
title = (r.get('title') or "").replace('\n', ' ').strip()
|
||||||
content = str(r.get('content', '')).replace('\n', ' ').strip()[:800]
|
content = str(r.get('content', '')).replace('\n', ' ').strip()[:800]
|
||||||
deep_lines.append(f"[{i+1}] {domain}{date_str}: {title}: {content}")
|
idx = i + 1 + offset
|
||||||
|
deep_lines.append(f"[{idx}] {domain}{date_str}: {title}: {content}")
|
||||||
|
|
||||||
if deep_lines:
|
if deep_lines:
|
||||||
context_parts.append("DEEP SOURCES:\n" + "\n".join(deep_lines))
|
context_parts.append("DEEP SOURCES:\n" + "\n".join(deep_lines))
|
||||||
|
|
||||||
|
# Shallow sources: headlines only
|
||||||
if self.context_shallow_count > 0:
|
if self.context_shallow_count > 0:
|
||||||
shallow_lines = []
|
shallow_lines = []
|
||||||
start_idx = self.context_deep_count
|
start_idx = self.context_deep_count
|
||||||
end_idx = self.context_deep_count + self.context_shallow_count
|
end_idx = self.context_deep_count + self.context_shallow_count
|
||||||
for i, r in enumerate(raw_results[start_idx:end_idx]):
|
for i, r in enumerate(raw_results[start_idx:end_idx]):
|
||||||
domain = urlparse(r.get('url', '')).netloc.replace('www.', '')
|
url = r.get('url', '')
|
||||||
|
result_urls.append(url)
|
||||||
|
domain = urlparse(url).netloc.replace('www.', '')
|
||||||
title = (r.get('title') or '').replace('\n', ' ').strip()[:60]
|
title = (r.get('title') or '').replace('\n', ' ').strip()[:60]
|
||||||
shallow_lines.append(f"[{i+1+start_idx}] {domain}: {title}")
|
idx = i + 1 + start_idx + offset
|
||||||
|
shallow_lines.append(f"[{idx}] {domain}: {title}")
|
||||||
|
|
||||||
if shallow_lines:
|
if shallow_lines:
|
||||||
context_parts.append("SHALLOW SOURCES (headlines):\n" + "\n".join(shallow_lines))
|
context_parts.append("SHALLOW SOURCES (headlines):\n" + "\n".join(shallow_lines))
|
||||||
|
|
||||||
return "\n\n".join(context_parts)
|
return "\n\n".join(context_parts), result_urls
|
||||||
|
|
||||||
def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults:
|
def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults:
|
||||||
results = EngineResults()
|
results = EngineResults()
|
||||||
@@ -927,7 +906,12 @@ class SXNGPlugin(Plugin):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
raw_results = search.result_container.get_ordered_results()
|
raw_results = search.result_container.get_ordered_results()
|
||||||
context_str = self._assemble_context(search, raw_results)
|
raw_infoboxes = getattr(search.result_container, 'infoboxes', [])
|
||||||
|
raw_answers = getattr(search.result_container, 'answers', [])
|
||||||
|
|
||||||
|
# Normalize for unified context assembly
|
||||||
|
_, infoboxes, answers = self._parse_aux_results(raw_results, raw_infoboxes, raw_answers)
|
||||||
|
context_str, _ = self._assemble_context(raw_results, infoboxes, answers)
|
||||||
|
|
||||||
|
|
||||||
ts = str(int(time.time()))
|
ts = str(int(time.time()))
|
||||||
|
|||||||
@@ -259,6 +259,7 @@ def run_tests():
|
|||||||
class MockResultContainer:
|
class MockResultContainer:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.answers = set()
|
self.answers = set()
|
||||||
|
self.infoboxes = []
|
||||||
def get_ordered_results(self):
|
def get_ordered_results(self):
|
||||||
return [
|
return [
|
||||||
{"title": "T1", "content": "C1", "url": "https://a.com/1", "publishedDate": "2026-01-15"},
|
{"title": "T1", "content": "C1", "url": "https://a.com/1", "publishedDate": "2026-01-15"},
|
||||||
|
|||||||
Reference in New Issue
Block a user