""" Work Item Processor Handles processing of Azure DevOps work items and UUF items """ import re import html import requests from typing import Dict, Any, Optional, Tuple from urllib.parse import urlparse from .utils import WorkItemFieldExtractor # User agent for web requests USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' class WorkItemProcessor: """Processor for extracting and validating work item data with advanced parsing""" def __init__(self, logger, config: Dict[str, Any] = None): self.logger = logger self.log = logger.log if hasattr(logger, 'log') else logger self.config = config or {} def process_work_item(self, work_item: Dict[str, Any]) -> Optional[Dict[str, Any]]: """Process a single work item to extract required fields with advanced validation""" try: work_item_id = work_item['id'] title = work_item.get('fields', {}).get('System.Title', 'No Title') description = work_item.get('fields', {}).get('System.Description', '') if not description: self.log(f"Work item {work_item_id} has no description, skipping") return None # Parse description for required fields parsed_data = self._parse_description(description) if not parsed_data: self.log(f"Work item {work_item_id} doesn't contain required fields, skipping") return None # Validate nature of request (check for both variations) nature_lower = parsed_data['nature_of_request'].lower() if not ("modify existing docs" in nature_lower or "modifying existing docs" in nature_lower): self.log(f"Work item {work_item_id} nature of request doesn't contain 'modify existing docs', skipping") return None # Extract GitHub info from document URL github_info = self._extract_github_info(parsed_data['mydoc_url']) # If the document does not include an original_content_git_url, skip this work item if not github_info.get('original_content_git_url'): self.log(f"Work item {work_item_id} skipped: original_content_git_url not found in document {parsed_data['mydoc_url']}") return None # Construct proper web URL for work item # The API returns something like: https://dev.azure.com/org/project/_apis/wit/workItems/123 # We need to convert it to: https://dev.azure.com/org/project/_workitems/edit/123 work_item_url = '' api_url = work_item.get('url', '') if api_url: # Convert API URL to web URL # Replace /_apis/wit/workItems/ with /_workitems/edit/ work_item_url = api_url.replace('/_apis/wit/workItems/', '/_workitems/edit/') processed_item = { 'id': work_item_id, 'title': title, 'nature_of_request': parsed_data['nature_of_request'], 'mydoc_url': parsed_data['mydoc_url'], 'text_to_change': parsed_data['text_to_change'], 'new_text': parsed_data['new_text'], 'github_info': github_info, 'status': 'Ready', 'source': 'Azure DevOps', 'source_url': work_item_url, # URL to Azure DevOps work item 'original_new_text': parsed_data['new_text'] # Keep original for reference } self.log(f"Successfully processed work item {work_item_id}") return processed_item except Exception as e: self.log(f"Error processing work item {work_item.get('id', 'unknown')}: {str(e)}") return None def process_uuf_item(self, uuf_item: Dict[str, Any]) -> Optional[Dict[str, Any]]: """Process a single UUF item from Dataverse/PowerApp with enhanced field mapping""" try: # Extract UUF item ID (adjust field name as needed) uuf_id = uuf_item.get('cr4af_uufid') or uuf_item.get('cr4af_name') or 'unknown' # Extract title title = uuf_item.get('cr4af_title') or uuf_item.get('cr4af_subject') or 'No Title' # Extract description/details description = uuf_item.get('cr4af_description') or uuf_item.get('cr4af_details') or '' if not description: self.log(f"UUF item {uuf_id} has no description, skipping") return None # Extract document URL doc_url = uuf_item.get('cr4af_documenturl') or uuf_item.get('cr4af_docurl') or '' if not doc_url: self.log(f"UUF item {uuf_id} has no document URL, skipping") return None # Extract text to change and new text text_to_change = uuf_item.get('cr4af_texttochange') or uuf_item.get('cr4af_currenttext') or '' new_text = uuf_item.get('cr4af_proposednewtext') or uuf_item.get('cr4af_newtext') or '' if not text_to_change or not new_text: self.log(f"UUF item {uuf_id} missing text fields, skipping") return None # Extract GitHub info from document URL github_info = self._extract_github_info(doc_url) # If the document does not include an original_content_git_url, skip this item if not github_info.get('original_content_git_url'): self.log(f"UUF item {uuf_id} skipped: original_content_git_url not found in document {doc_url}") return None # Get UUF item URL if available (e.g., from Dataverse) uuf_url = uuf_item.get('cr4af_itemurl', '') or uuf_item.get('cr4af_url', '') processed_item = { 'id': uuf_id, 'title': title, 'nature_of_request': 'UUF Item - Modify existing docs', 'mydoc_url': doc_url, 'text_to_change': text_to_change, 'new_text': new_text, 'github_info': github_info, 'status': 'Ready', 'source': 'UUF', # Mark as UUF item 'source_url': uuf_url, # URL to UUF item (if available) 'original_new_text': new_text } self.log(f"Successfully processed UUF item {uuf_id}") return processed_item except Exception as e: self.log(f"Error processing UUF item {uuf_item.get('cr4af_uufid', 'unknown')}: {str(e)}") return None def _parse_description(self, description: str) -> Optional[Dict[str, Any]]: """Parse work item description to extract required fields using enhanced regex patterns""" # Enhanced regex patterns from regex_V5 patterns = { 'nature_of_request': r'nature\s+of\s+request[:\s]*([^\)]*\))', 'link_to_doc': r'link\s+to\s+doc[:\s]*([^\s&]+)', 'text_to_change': r'text\s+to\s+change[:\s]*([\s\S]*?)(?=\n*-+\s*Proposed new text|If adding brand new docs:|$)', 'proposed_new_text': r'proposed\s+new\s+text[:\s]*([\s\S]+?)(?=\s*If\s+adding\s+brand\s+new\s+docs:)' } # Clean HTML tags if present clean_description = re.sub(r'<[^>]+>', '', description) # Convert HTML entities to characters (e.g., " to ", & to &) clean_description = html.unescape(clean_description) extracted = {} for field, pattern in patterns.items(): match = re.search(pattern, clean_description, re.IGNORECASE | re.DOTALL) if match: value = match.group(1).strip() if field == 'nature_of_request': extracted['nature_of_request'] = value elif field == 'link_to_doc': extracted['mydoc_url'] = value.rstrip('-') elif field == 'text_to_change': extracted['text_to_change'] = value elif field == 'proposed_new_text': extracted['new_text'] = value # If enhanced patterns don't work, fall back to basic patterns if not all(field in extracted for field in ['nature_of_request', 'mydoc_url', 'text_to_change', 'new_text']): basic_patterns = { 'nature_of_request': r'nature\s+of\s+request[:\s]*([^\n]+)', 'link_to_doc': r'link\s+to\s+doc[:\s]*([^\s]+)', 'text_to_change': r'text\s+to\s+change[:\s]*(.+?)(?=proposed\s+new\s+text|$)', 'proposed_new_text': r'proposed\s+new\s+text[:\s]*(.+?)(?=\n\n|$)' } extracted = {} for field, pattern in basic_patterns.items(): match = re.search(pattern, clean_description, re.IGNORECASE | re.DOTALL) if match: value = match.group(1).strip() if field == 'nature_of_request': extracted['nature_of_request'] = value elif field == 'link_to_doc': extracted['mydoc_url'] = value elif field == 'text_to_change': extracted['text_to_change'] = value elif field == 'proposed_new_text': extracted['new_text'] = value # Validate all required fields are present required_fields = ['nature_of_request', 'mydoc_url', 'text_to_change', 'new_text'] if not all(field in extracted for field in required_fields): return None return extracted def _extract_github_info(self, doc_url: str) -> Dict[str, Any]: """Extract GitHub repository info and ms.author from document URL If GITHUB_REPO is configured in .env, it will be used instead of the repo extracted from the document metadata. This allows you to create PRs in your fork while preserving the file path and ms.author from the original document. """ try: # Fetch the document headers = {'User-Agent': USER_AGENT} response = requests.get(doc_url, headers=headers, timeout=30) response.raise_for_status() html_content = response.text # Extract ms.author ms_author = self._extract_meta_tag(html_content, 'ms.author') # Extract original_content_git_url original_content_git_url = self._extract_meta_tag(html_content, 'original_content_git_url') if not original_content_git_url: # Try alternative extraction method match = re.search(r"original_content_git_url[\"\']?\s*[:=]\s*[\"\']([^\"']+)[\"']", html_content, re.IGNORECASE) if match: original_content_git_url = match.group(1).strip() if not original_content_git_url: raise ValueError("original_content_git_url not found in document") # Check if GITHUB_REPO is configured in .env # If it is, use that instead of the repo from the document configured_repo = self.config.get('GITHUB_REPO') if configured_repo and '/' in configured_repo: # Use the configured repository (e.g., "b-tsammons/fabric-docs-pr") parts = configured_repo.split('/', 1) owner = parts[0].strip() repo = parts[1].strip() self.log(f"Using configured GITHUB_REPO: {owner}/{repo} (overriding document metadata)") else: # Parse GitHub owner/repo from original_content_git_url (fallback to document metadata) owner, repo = self._parse_github_url(original_content_git_url) self.log(f"Using repository from document metadata: {owner}/{repo}") return { 'ms_author': ms_author, 'original_content_git_url': original_content_git_url, 'owner': owner, 'repo': repo } except Exception as e: self.log(f"Error extracting GitHub info from {doc_url}: {str(e)}") return { 'ms_author': None, 'original_content_git_url': None, 'owner': None, 'repo': None, 'error': str(e) } def _extract_meta_tag(self, html_content: str, name: str) -> Optional[str]: """Extract content from meta tag""" pattern = rf']*?\s)?(?:name|property)\s*=\s*["\'](?P{re.escape(name)})["\']\s+[^>]*?\bcontent\s*=\s*["\'](?P[^"\']+)["\'][^>]*?>' match = re.search(pattern, html_content, re.IGNORECASE) if match: return match.group('content').strip() return None def _parse_github_url(self, url: str) -> Tuple[str, str]: """Parse GitHub URL to extract owner and repo""" parsed = urlparse(url) if "github.com" not in parsed.netloc.lower(): raise ValueError(f"Not a GitHub URL: {url}") parts = [p for p in parsed.path.split("/") if p] if len(parts) < 2: raise ValueError(f"Unable to parse owner/repo from: {url}") return parts[0], parts[1]