Moved the current files to private repo
This commit is contained in:
@@ -0,0 +1,291 @@
|
||||
"""
|
||||
Work Item Processor
|
||||
Handles processing of Azure DevOps work items and UUF items
|
||||
"""
|
||||
|
||||
import re
|
||||
import html
|
||||
import requests
|
||||
from typing import Dict, Any, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
from .utils import WorkItemFieldExtractor
|
||||
|
||||
# Browser-like User-Agent header sent when fetching documentation pages
|
||||
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
|
||||
|
||||
class WorkItemProcessor:
    """Extract and validate doc-change requests from work items.

    Two sources are supported: Azure DevOps work items, whose free-text
    description is parsed with regular expressions, and UUF items, whose
    values are read from dedicated ``cr4af_*`` fields. Both paths return
    the same dictionary shape, or ``None`` when the item must be skipped.
    """

    # Keys every parsed description must provide, each with a non-empty value.
    _REQUIRED_FIELDS = ('nature_of_request', 'mydoc_url', 'text_to_change', 'new_text')

    # Maps a pattern label to the key used in the parsed result.
    _FIELD_KEYS = {
        'nature_of_request': 'nature_of_request',
        'link_to_doc': 'mydoc_url',
        'text_to_change': 'text_to_change',
        'proposed_new_text': 'new_text',
    }

    # Enhanced regex patterns from regex_V5
    _ENHANCED_PATTERNS = {
        'nature_of_request': r'nature\s+of\s+request[:\s]*([^\)]*\))',
        'link_to_doc': r'link\s+to\s+doc[:\s]*([^\s&]+)',
        'text_to_change': r'text\s+to\s+change[:\s]*([\s\S]*?)(?=\n*-+\s*Proposed new text|If adding brand new docs:|$)',
        'proposed_new_text': r'proposed\s+new\s+text[:\s]*([\s\S]+?)(?=\s*If\s+adding\s+brand\s+new\s+docs:)',
    }

    # Looser patterns, tried only when the enhanced set misses a field.
    _BASIC_PATTERNS = {
        'nature_of_request': r'nature\s+of\s+request[:\s]*([^\n]+)',
        'link_to_doc': r'link\s+to\s+doc[:\s]*([^\s]+)',
        'text_to_change': r'text\s+to\s+change[:\s]*(.+?)(?=proposed\s+new\s+text|$)',
        'proposed_new_text': r'proposed\s+new\s+text[:\s]*(.+?)(?=\n\n|$)',
    }

    def __init__(self, logger, config: Optional[Dict[str, Any]] = None):
        """Store the logger and configuration.

        Args:
            logger: Either an object exposing ``log(message)`` or a plain
                callable taking the message.
            config: Optional settings; only ``GITHUB_REPO`` is read here.
        """
        self.logger = logger
        # Accept both a logger object and a bare logging callable.
        self.log = logger.log if hasattr(logger, 'log') else logger
        self.config = config or {}

    def process_work_item(self, work_item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a single Azure DevOps work item.

        Args:
            work_item: Work item payload with ``id``, ``fields`` and
                optionally ``url``.

        Returns:
            The processed item dictionary, or ``None`` when the item is
            skipped (no description, missing or empty required fields,
            wrong nature of request, no ``original_content_git_url``) or
            when any error occurs. Every skip is logged.
        """
        try:
            work_item_id = work_item['id']
            fields = work_item.get('fields') or {}
            title = fields.get('System.Title', 'No Title')
            description = fields.get('System.Description', '')

            if not description:
                self.log(f"Work item {work_item_id} has no description, skipping")
                return None

            # Parse description for required fields
            parsed_data = self._parse_description(description)
            if not parsed_data:
                self.log(f"Work item {work_item_id} doesn't contain required fields, skipping")
                return None

            # Validate nature of request (check for both variations)
            nature_lower = parsed_data['nature_of_request'].lower()
            accepted = ("modify existing docs", "modifying existing docs")
            if not any(phrase in nature_lower for phrase in accepted):
                self.log(f"Work item {work_item_id} nature of request doesn't contain 'modify existing docs', skipping")
                return None

            # Extract GitHub info from document URL
            github_info = self._extract_github_info(parsed_data['mydoc_url'])

            # If the document does not include an original_content_git_url, skip this work item
            if not github_info.get('original_content_git_url'):
                self.log(f"Work item {work_item_id} skipped: original_content_git_url not found in document {parsed_data['mydoc_url']}")
                return None

            # The API returns something like:
            #   https://dev.azure.com/org/project/_apis/wit/workItems/123
            # The web UI URL is:
            #   https://dev.azure.com/org/project/_workitems/edit/123
            api_url = work_item.get('url', '')
            work_item_url = ''
            if api_url:
                work_item_url = api_url.replace('/_apis/wit/workItems/', '/_workitems/edit/')

            processed_item = {
                'id': work_item_id,
                'title': title,
                'nature_of_request': parsed_data['nature_of_request'],
                'mydoc_url': parsed_data['mydoc_url'],
                'text_to_change': parsed_data['text_to_change'],
                'new_text': parsed_data['new_text'],
                'github_info': github_info,
                'status': 'Ready',
                'source': 'Azure DevOps',
                'source_url': work_item_url,  # URL to Azure DevOps work item
                'original_new_text': parsed_data['new_text'],  # Keep original for reference
            }

            self.log(f"Successfully processed work item {work_item_id}")
            return processed_item

        except Exception as e:
            # Guard the lookup so a non-dict input cannot make the handler itself raise.
            item_id = work_item.get('id', 'unknown') if isinstance(work_item, dict) else 'unknown'
            self.log(f"Error processing work item {item_id}: {str(e)}")
            return None

    def process_uuf_item(self, uuf_item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a single UUF item read from ``cr4af_*`` fields.

        Args:
            uuf_item: Record whose values are looked up under ``cr4af_*``
                keys, each with one alternative key as fallback.

        Returns:
            The processed item dictionary (same shape as
            ``process_work_item``), or ``None`` when a required value is
            missing, the document has no ``original_content_git_url``, or
            an error occurs. Every skip is logged.
        """
        # Resolved before the try block so the error log can reuse it.
        uuf_id = 'unknown'
        try:
            # Extract UUF item ID (adjust field name as needed)
            uuf_id = uuf_item.get('cr4af_uufid') or uuf_item.get('cr4af_name') or 'unknown'

            # Extract title
            title = uuf_item.get('cr4af_title') or uuf_item.get('cr4af_subject') or 'No Title'

            # Extract description/details
            description = uuf_item.get('cr4af_description') or uuf_item.get('cr4af_details') or ''
            if not description:
                self.log(f"UUF item {uuf_id} has no description, skipping")
                return None

            # Extract document URL
            doc_url = uuf_item.get('cr4af_documenturl') or uuf_item.get('cr4af_docurl') or ''
            if not doc_url:
                self.log(f"UUF item {uuf_id} has no document URL, skipping")
                return None

            # Extract text to change and new text
            text_to_change = uuf_item.get('cr4af_texttochange') or uuf_item.get('cr4af_currenttext') or ''
            new_text = uuf_item.get('cr4af_proposednewtext') or uuf_item.get('cr4af_newtext') or ''
            if not text_to_change or not new_text:
                self.log(f"UUF item {uuf_id} missing text fields, skipping")
                return None

            # Extract GitHub info from document URL
            github_info = self._extract_github_info(doc_url)

            # If the document does not include an original_content_git_url, skip this item
            if not github_info.get('original_content_git_url'):
                self.log(f"UUF item {uuf_id} skipped: original_content_git_url not found in document {doc_url}")
                return None

            # Get UUF item URL if available
            uuf_url = uuf_item.get('cr4af_itemurl', '') or uuf_item.get('cr4af_url', '')

            processed_item = {
                'id': uuf_id,
                'title': title,
                'nature_of_request': 'UUF Item - Modify existing docs',
                'mydoc_url': doc_url,
                'text_to_change': text_to_change,
                'new_text': new_text,
                'github_info': github_info,
                'status': 'Ready',
                'source': 'UUF',  # Mark as UUF item
                'source_url': uuf_url,  # URL to UUF item (if available)
                'original_new_text': new_text,
            }

            self.log(f"Successfully processed UUF item {uuf_id}")
            return processed_item

        except Exception as e:
            self.log(f"Error processing UUF item {uuf_id}: {str(e)}")
            return None

    def _match_fields(self, text: str, patterns: Dict[str, str], trim_url_dashes: bool) -> Dict[str, str]:
        """Run each labelled pattern against ``text`` and collect group 1 of every hit."""
        found = {}
        for label, pattern in patterns.items():
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if not match:
                continue
            value = match.group(1).strip()
            if trim_url_dashes and label == 'link_to_doc':
                # Drop trailing dashes captured together with the URL.
                value = value.rstrip('-')
            found[self._FIELD_KEYS[label]] = value
        return found

    def _parse_description(self, description: str) -> Optional[Dict[str, Any]]:
        """Parse a work item description into the four required fields.

        HTML tags are removed and entities unescaped, then the enhanced
        patterns are applied. If any field is not matched, the result is
        discarded and the basic patterns are applied instead.

        Args:
            description: Raw description text, possibly containing HTML.

        Returns:
            A dict with ``nature_of_request``, ``mydoc_url``,
            ``text_to_change`` and ``new_text``, or ``None`` when any of
            them is missing or empty after stripping.
        """
        # Clean HTML tags if present
        clean_description = re.sub(r'<[^>]+>', '', description)

        # Convert HTML entities to characters (e.g., &quot; to ", &amp; to &)
        clean_description = html.unescape(clean_description)

        extracted = self._match_fields(clean_description, self._ENHANCED_PATTERNS, trim_url_dashes=True)

        # If enhanced patterns don't work, fall back to basic patterns
        if any(key not in extracted for key in self._REQUIRED_FIELDS):
            extracted = self._match_fields(clean_description, self._BASIC_PATTERNS, trim_url_dashes=False)

        # A pattern can match yet capture only whitespace; an empty value
        # is as unusable as a missing one, so both are rejected here.
        if not all(extracted.get(key) for key in self._REQUIRED_FIELDS):
            return None

        return extracted

    def _extract_github_info(self, doc_url: str) -> Dict[str, Any]:
        """Fetch a document and extract its GitHub source info and ms.author.

        If ``GITHUB_REPO`` is present in the config as ``owner/repo``, it
        is used instead of the repo parsed from the document metadata.
        This allows creating PRs in a fork while preserving the file path
        and ms.author from the original document.

        Args:
            doc_url: URL of the published document to download.

        Returns:
            A dict with ``ms_author``, ``original_content_git_url``,
            ``owner`` and ``repo``. On any failure all four are ``None``
            and an ``error`` key carries the message; nothing is raised.
        """
        try:
            # Fetch the document
            headers = {'User-Agent': USER_AGENT}
            response = requests.get(doc_url, headers=headers, timeout=30)
            response.raise_for_status()
            html_content = response.text

            # Extract ms.author
            ms_author = self._extract_meta_tag(html_content, 'ms.author')

            # Extract original_content_git_url
            original_content_git_url = self._extract_meta_tag(html_content, 'original_content_git_url')

            if not original_content_git_url:
                # Try alternative extraction method: a key/value assignment
                # anywhere in the page instead of a <meta> tag.
                match = re.search(r"original_content_git_url[\"\']?\s*[:=]\s*[\"\']([^\"']+)[\"']", html_content, re.IGNORECASE)
                if match:
                    original_content_git_url = match.group(1).strip()

            if not original_content_git_url:
                raise ValueError("original_content_git_url not found in document")

            # A configured GITHUB_REPO takes precedence over the document metadata.
            configured_repo = self.config.get('GITHUB_REPO')

            if configured_repo and '/' in configured_repo:
                # Use the configured repository (e.g., "b-tsammons/fabric-docs-pr")
                parts = configured_repo.split('/', 1)
                owner = parts[0].strip()
                repo = parts[1].strip()
                self.log(f"Using configured GITHUB_REPO: {owner}/{repo} (overriding document metadata)")
            else:
                # Parse GitHub owner/repo from original_content_git_url (fallback to document metadata)
                owner, repo = self._parse_github_url(original_content_git_url)
                self.log(f"Using repository from document metadata: {owner}/{repo}")

            return {
                'ms_author': ms_author,
                'original_content_git_url': original_content_git_url,
                'owner': owner,
                'repo': repo,
            }

        except Exception as e:
            self.log(f"Error extracting GitHub info from {doc_url}: {str(e)}")
            return {
                'ms_author': None,
                'original_content_git_url': None,
                'owner': None,
                'repo': None,
                'error': str(e),
            }

    def _extract_meta_tag(self, html_content: str, name: str) -> Optional[str]:
        """Return the ``content`` of the ``<meta>`` tag identified by ``name``.

        The tag is matched on its ``name`` or ``property`` attribute,
        case-insensitively. Attribute order does not matter: the
        name-before-content form is tried first, then content-before-name.

        Args:
            html_content: HTML text to search.
            name: Value of the ``name``/``property`` attribute to look for.

        Returns:
            The stripped ``content`` value, or ``None`` when no tag matches.
        """
        escaped = re.escape(name)
        patterns = (
            # <meta name="..." content="...">
            rf'<meta\s+(?:[^>]*?\s)?(?:name|property)\s*=\s*["\'](?P<n>{escaped})["\']\s+[^>]*?\bcontent\s*=\s*["\'](?P<content>[^"\']+)["\'][^>]*?>',
            # <meta content="..." name="...">
            rf'<meta\s+(?:[^>]*?\s)?content\s*=\s*["\'](?P<content>[^"\']+)["\']\s+(?:[^>]*?\s)?(?:name|property)\s*=\s*["\'](?P<n>{escaped})["\'][^>]*?>',
        )
        for pattern in patterns:
            match = re.search(pattern, html_content, re.IGNORECASE)
            if match:
                return match.group('content').strip()
        return None

    def _parse_github_url(self, url: str) -> Tuple[str, str]:
        """Parse a GitHub URL into ``(owner, repo)``.

        Args:
            url: URL whose first two path segments are owner and repo.

        Returns:
            The ``(owner, repo)`` tuple.

        Raises:
            ValueError: If the host is not ``github.com`` (or one of its
                subdomains) or the path has fewer than two segments.
        """
        parsed = urlparse(url)
        # Compare the hostname itself rather than searching the netloc for
        # a substring, so "github.com.evil.example" and
        # "github.com@evil.example" are rejected.
        host = (parsed.hostname or '').lower()
        if host != 'github.com' and not host.endswith('.github.com'):
            raise ValueError(f"Not a GitHub URL: {url}")
        parts = [p for p in parsed.path.split("/") if p]
        if len(parts) < 2:
            raise ValueError(f"Unable to parse owner/repo from: {url}")
        return parts[0], parts[1]
|
||||
Reference in New Issue
Block a user