#!/usr/bin/env python3
"""
Enhanced Company Information Scraper with SearXNG Integration
----------------------------
A Python script to collect various company information including executives,
contact details, and addresses using SearXNG as the search backend.

Enhanced features:
- Search for staff by specific titles
- Collect contact information (phone, email, social media)
- Multiple output modes (minimal, targeted, comprehensive)
- Configurable data collection targets
"""

import argparse
import csv
import json
import os
import random
import re
import sys
import time
from datetime import datetime
from urllib.parse import quote_plus, urlencode

try:
    import requests
    from bs4 import BeautifulSoup
    from fake_useragent import UserAgent
except ImportError:
    print("Required packages not found. Please install them with:")
    print("pip install requests beautifulsoup4 fake-useragent")
    sys.exit(1)


# Configuration
class Config:
    """Static configuration shared by the whole script."""

    VERSION = "2.0.0"
    DEFAULT_TIMEOUT = 20
    CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".cache")
    DEBUG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".debug")
    RAW_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "raw_scrapes")

    # SearXNG configuration (may be overridden from the command line in parse_args)
    SEARXNG_URL = "http://localhost:8888/"

    # Search engines to use with SearXNG
    SEARCH_ENGINES = [
        "google",
        "duckduckgo",
        "bing"
    ]

    # Search delay ranges (min, max) in seconds
    DELAY_BETWEEN_SEARCHES = (1, 3)    # Can be lower with SearXNG
    DELAY_BETWEEN_COMPANIES = (2, 5)   # Can be lower with SearXNG
    DELAY_BEFORE_SEARCH = (0.5, 1.5)   # Can be lower with SearXNG

    # Retry configuration
    MAX_RETRIES = 3
    RETRY_DELAY = (2, 5)  # Can be lower with SearXNG

    # Available search types (key -> human-readable description)
    SEARCH_TYPES = {
        "ceo": "CEO information",
        "hq": "Headquarters address",
        "phone": "Phone numbers",
        "email": "Email addresses",
        "social": "Social media profiles",
        "staff": "Staff members by title",
        "contact": "General contact information",
        "mailing": "Mailing address"
    }

    # Minimal mode search types
    MINIMAL_SEARCH_TYPES = ["ceo", "hq"]

    # Default comprehensive search types (everything)
    COMPREHENSIVE_SEARCH_TYPES = list(SEARCH_TYPES.keys())


class EnhancedCompanyScraper:
    """Scrape company information through a SearXNG instance and write a CSV.

    One instance processes a list of company names: for every company it
    issues one SearXNG search per selected search type, extracts data from
    the result HTML with regex heuristics, and accumulates a row per company.
    """

    def __init__(self, args):
        """Store CLI args, prepare directories, and verify SearXNG is up.

        Exits the process if SearXNG is unreachable (unless --dry-run).
        """
        self.args = args
        self.companies = []
        self.results = []
        self.session = requests.Session()

        # Determine which search types to use based on mode
        self.search_types = self.determine_search_types()

        self.setup_directories()

        # Check if SearXNG is running
        if not self.check_searxng():
            print(f"Error: SearXNG not available at {Config.SEARXNG_URL}")
            print("Please make sure SearXNG is running before using this script.")
            print("You can start it with: docker-compose up -d")
            sys.exit(1)

        # Use fake-useragent to rotate user agents
        try:
            self.ua = UserAgent()
        except Exception:  # FIX: was a bare except; keep SystemExit/KeyboardInterrupt alive
            # Fallback if fake-useragent fails
            self.ua = None
            print("Warning: fake-useragent failed to initialize. Using default user agent.")

    def determine_search_types(self):
        """Determine which search types to use based on mode and args.

        Returns the ordered list of search-type keys (see Config.SEARCH_TYPES).
        """
        search_types = []

        # Start with default search types
        if self.args.mode == "minimal":
            search_types = Config.MINIMAL_SEARCH_TYPES.copy()
        elif self.args.mode == "comprehensive":
            search_types = Config.COMPREHENSIVE_SEARCH_TYPES.copy()
        elif self.args.mode == "targeted":
            # For targeted mode, use only what was specified
            if self.args.target_staff:
                search_types.append("staff")
            else:
                # If no staff title specified, default to CEO
                search_types.append("ceo")

            # Add any explicitly requested types.
            # NOTE(review): these --include-* flags are documented as
            # "in targeted mode" in the CLI help, so they are applied only
            # inside this branch (also avoids duplicating types already
            # present in comprehensive mode) — confirm against original intent.
            if self.args.include_contact:
                search_types.extend(["phone", "email"])
            if self.args.include_address:
                search_types.extend(["hq", "mailing"])
            if self.args.include_social:
                search_types.append("social")

            # If nothing explicitly included, add headquarters
            if len(search_types) == 1:  # Only staff/ceo
                search_types.append("hq")

        # Override with explicit includes/excludes
        if self.args.include_types:
            for type_name in self.args.include_types.split(','):
                type_name = type_name.strip()
                if type_name in Config.SEARCH_TYPES and type_name not in search_types:
                    search_types.append(type_name)

        if self.args.exclude_types:
            for type_name in self.args.exclude_types.split(','):
                type_name = type_name.strip()
                if type_name in search_types:
                    search_types.remove(type_name)

        # Log selected search types
        if self.args.verbose:
            print(f"Selected search types: {', '.join(search_types)}")

        return search_types

    def check_searxng(self):
        """Check if SearXNG is running and available."""
        if self.args.dry_run:
            return True
        try:
            response = requests.get(Config.SEARXNG_URL, timeout=5)
            return response.status_code == 200
        except requests.RequestException:  # FIX: was a bare except
            return False

    def setup_directories(self):
        """Create necessary directories for caching and debugging."""
        # Create cache directories for all search types
        if self.args.use_cache:
            for search_type in Config.SEARCH_TYPES.keys():
                os.makedirs(os.path.join(Config.CACHE_DIR, search_type), exist_ok=True)

        if self.args.debug:
            os.makedirs(Config.DEBUG_DIR, exist_ok=True)
            os.makedirs(os.path.join(Config.DEBUG_DIR, "extraction"), exist_ok=True)
            os.makedirs(os.path.join(Config.DEBUG_DIR, "patterns"), exist_ok=True)

        if self.args.save_raw:
            for search_type in Config.SEARCH_TYPES.keys():
                os.makedirs(os.path.join(Config.RAW_DIR, search_type), exist_ok=True)

    def load_companies(self):
        """Load companies from file or stdin; exits if none are provided."""
        if self.args.input_file:
            try:
                with open(self.args.input_file, 'r') as f:
                    for line in f:
                        company = line.strip()
                        if company:
                            self.companies.append(company)
            except Exception as e:
                print(f"Error loading companies from file: {e}")
                sys.exit(1)
        else:
            print("Enter company names (one per line), press Ctrl+D when finished:")
            for line in sys.stdin:
                company = line.strip()
                if company:
                    self.companies.append(company)

        if not self.companies:
            print("No companies provided!")
            sys.exit(1)

        print(f"Loaded {len(self.companies)} companies")

    def get_random_user_agent(self):
        """Get a random user agent (falls back to a fixed Chrome UA string)."""
        if self.ua:
            return self.ua.random
        return ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    def get_searxng_url(self, query, search_type, engine):
        """Build the SearXNG search URL for the given engine and search type."""
        query = quote_plus(query)

        # Extra keywords appended to the company name, chosen per search type
        search_terms = ""
        if search_type == "ceo":
            search_terms = "CEO who is the chief executive"
        elif search_type == "hq":
            search_terms = "headquarters address location where is"
        elif search_type == "phone":
            search_terms = "phone number contact"
        elif search_type == "email":
            search_terms = "email address contact"
        elif search_type == "social":
            search_terms = "social media profiles twitter linkedin facebook"
        elif search_type == "contact":
            search_terms = "contact information phone email"
        elif search_type == "mailing":
            search_terms = "mailing address postal"
        elif search_type == "staff":
            # For staff, include the target title in the search
            staff_title = self.args.target_staff or "executive team"
            search_terms = f"{staff_title} who is"

        # Build the full query
        full_query = f"{query} {search_terms}"

        # Prepare parameters for SearXNG
        params = {
            'q': full_query,
            'engines': engine,
            'format': 'html',
            'language': 'en-US'
        }

        # Build the URL
        url = f"{Config.SEARXNG_URL.rstrip('/')}/?{urlencode(params)}"
        return url

    def search_company(self, company, search_type):
        """Search for company information with a specific search type.

        Returns the result page HTML, or "" when all engines/retries fail
        (and in dry-run mode).  Results are cached per (type, company) when
        caching is enabled.
        """
        clean_company = re.sub(r'[^a-zA-Z0-9_-]', '+', company)
        cache_file = os.path.join(Config.CACHE_DIR, search_type, f"{clean_company}.html")

        # Check cache first if enabled
        if self.args.use_cache and os.path.exists(cache_file):
            self.debug_log(f"Using cached data for {search_type} search", company, "extraction")
            with open(cache_file, 'r', encoding='utf-8') as f:
                return f.read()

        # Try each search engine until one succeeds
        for retry in range(Config.MAX_RETRIES):
            for engine in Config.SEARCH_ENGINES:
                if self.args.verbose:
                    print(f"Searching for {company} {search_type} using SearXNG with {engine} (attempt {retry+1})")

                # Random delay before search
                delay = random.uniform(*Config.DELAY_BEFORE_SEARCH)
                if self.args.verbose:
                    print(f"Waiting {delay:.2f} seconds before search...")
                time.sleep(delay)

                # Get the search URL
                url = self.get_searxng_url(company, search_type, engine)

                if self.args.dry_run:
                    self.debug_log(f"Would search: {url}", company, "extraction")
                    return ""

                # Prepare headers with random user agent
                headers = {
                    "User-Agent": self.get_random_user_agent(),
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.5",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Connection": "keep-alive",
                    "Upgrade-Insecure-Requests": "1"
                }

                try:
                    response = self.session.get(
                        url,
                        headers=headers,
                        timeout=self.args.timeout
                    )

                    # Check if the response is valid
                    if response.status_code != 200:
                        if self.args.verbose:
                            print(f"Got status code {response.status_code} from SearXNG with {engine}")
                        continue

                    # Get the HTML content
                    html_content = response.text

                    # Save raw HTML if requested
                    if self.args.save_raw:
                        raw_file = os.path.join(Config.RAW_DIR, search_type, f"{clean_company}_{engine}.html")
                        with open(raw_file, 'w', encoding='utf-8') as f:
                            f.write(html_content)

                    # Save to cache if enabled
                    if self.args.use_cache:
                        with open(cache_file, 'w', encoding='utf-8') as f:
                            f.write(html_content)

                    return html_content

                except Exception as e:
                    if self.args.verbose:
                        print(f"Error searching with SearXNG/{engine}: {e}")
                    continue

            # If we've tried all engines and none worked, wait before retry
            if retry < Config.MAX_RETRIES - 1:
                retry_delay = random.uniform(*Config.RETRY_DELAY)
                if self.args.verbose:
                    print(f"All search engines failed. Waiting {retry_delay:.2f} seconds before retry...")
                time.sleep(retry_delay)

        # If all retries failed
        print(f"Warning: All search attempts failed for {company} {search_type}")
        return ""

    def extract_ceo(self, html_content, company):
        """Extract a CEO name from search result HTML, or "Not found"."""
        if self.args.dry_run:
            return f"CEO of {company} (dry run)"

        # FIX: the original tested `"" in html_content`, which is always True
        # and made every extraction return "Not found".  search_company()
        # returns "" on failure, so the intended guard is an emptiness check.
        if not html_content:
            return "Not found"

        self.debug_log(f"Attempting to extract CEO for {company}", company, "extraction")

        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Method 1: Look for structured data
        try:
            # Extract all text-containing elements
            text_elements = soup.find_all(['p', 'span', 'div', 'li'])

            # Create a list of text snippets for pattern matching
            snippets = []
            for element in text_elements:
                text = element.get_text(strip=True)
                if text and len(text) > 10:  # Ignore very short snippets
                    snippets.append(text)

            # Define CEO pattern matches.
            # NOTE(review): these patterns rely on capitalized name classes
            # ([A-Z][a-z]+) but are applied with re.IGNORECASE below, which
            # neutralizes the capitalization requirement — confirm intent.
            ceo_patterns = [
                r"CEO\s+(is|of)\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
                r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+is\s+(?:the\s+)?(?:current\s+)?(?:CEO|Chief Executive Officer)",
                r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+has been\s+(?:the\s+)?(?:CEO|Chief Executive Officer)",
                r"led by\s+(?:CEO|Chief Executive Officer)\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
                r"led by\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?),\s+(?:the\s+)?(?:CEO|Chief Executive Officer)",
                r"(?:CEO|Chief Executive Officer)[,]?\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
                r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+serves as\s+(?:the\s+)?(?:CEO|Chief Executive Officer)",
                r"current\s+(?:CEO|Chief Executive Officer)\s+(?:is\s+)?([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)"
            ]

            # Try each pattern on the snippets
            for snippet in snippets:
                for pattern in ceo_patterns:
                    self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
                    match = re.search(pattern, snippet, re.IGNORECASE)
                    if match:
                        # Determine which group contains the CEO name based on pattern:
                        # only the first pattern captures (is|of) in group 1.
                        if pattern.startswith(r"CEO"):
                            ceo = match.group(2)
                        else:
                            ceo = match.group(1)

                        if ceo:
                            self.debug_log(f"Extracted CEO from snippet: {ceo}", company, "extraction")
                            return ceo

            # If no patterns matched, look for CEO-related content more broadly
            ceo_related_texts = []
            for snippet in snippets:
                if "ceo" in snippet.lower() or "chief executive" in snippet.lower():
                    ceo_related_texts.append(snippet)

            if ceo_related_texts:
                # Look for a name pattern in the CEO-related content
                name_pattern = r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)"
                for text in ceo_related_texts:
                    match = re.search(name_pattern, text)
                    if match:
                        ceo = match.group(1)
                        self.debug_log(f"Extracted CEO from related text: {ceo}", company, "extraction")
                        return ceo

        except Exception as e:
            self.debug_log(f"Error extracting CEO: {e}", company, "extraction")

        # If all extraction methods fail, return placeholder
        self.debug_log("Failed to extract CEO", company, "extraction")
        return "Not found"

    def extract_staff_by_title(self, html_content, company):
        """Extract a staff member's name by the --target-staff title."""
        if self.args.dry_run:
            return f"Staff member ({self.args.target_staff}) of {company} (dry run)"

        # FIX: see extract_ceo — empty content means the search failed.
        if not html_content:
            return "Not found"

        target_title = self.args.target_staff
        if not target_title:
            return "No title specified"

        self.debug_log(f"Attempting to extract {target_title} for {company}", company, "extraction")

        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            # Extract all text-containing elements
            text_elements = soup.find_all(['p', 'span', 'div', 'li'])

            # Create a list of text snippets for pattern matching
            snippets = []
            for element in text_elements:
                text = element.get_text(strip=True)
                if text and len(text) > 10:  # Ignore very short snippets
                    snippets.append(text)

            # Create patterns for the specified title.
            # Normalize the title for pattern matching (spaces -> \s+).
            normalized_title = target_title.lower().replace(' ', '\\s+')

            # Define staff pattern matches (mirrors the CEO patterns)
            staff_patterns = [
                rf"{normalized_title}\s+(is|of)\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
                rf"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+is\s+(?:the\s+)?(?:current\s+)?(?:{normalized_title})",
                rf"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+has been\s+(?:the\s+)?(?:{normalized_title})",
                rf"led by\s+(?:{normalized_title})\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
                rf"led by\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?),\s+(?:the\s+)?(?:{normalized_title})",
                rf"(?:{normalized_title})[,]?\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
                rf"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+serves as\s+(?:the\s+)?(?:{normalized_title})",
                rf"current\s+(?:{normalized_title})\s+(?:is\s+)?([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)"
            ]

            # Try each pattern on the snippets
            for snippet in snippets:
                for pattern in staff_patterns:
                    self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
                    match = re.search(pattern, snippet, re.IGNORECASE)
                    if match:
                        # Extract the name based on the pattern: only the
                        # first pattern captures (is|of) before the name.
                        if len(match.groups()) > 1 and pattern.startswith(rf"{normalized_title}"):
                            staff_name = match.group(2)
                        else:
                            staff_name = match.group(1)

                        if staff_name:
                            self.debug_log(f"Extracted {target_title} from snippet: {staff_name}", company, "extraction")
                            return staff_name

            # If no patterns matched, look for title-related content more broadly
            title_related_texts = []
            for snippet in snippets:
                if target_title.lower() in snippet.lower():
                    title_related_texts.append(snippet)

            if title_related_texts:
                # Look for a name pattern in the title-related content
                name_pattern = r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)"
                for text in title_related_texts:
                    match = re.search(name_pattern, text)
                    if match:
                        staff_name = match.group(1)
                        self.debug_log(f"Extracted {target_title} from related text: {staff_name}", company, "extraction")
                        return staff_name

        except Exception as e:
            self.debug_log(f"Error extracting {target_title}: {e}", company, "extraction")

        # If all extraction methods fail, return placeholder
        self.debug_log(f"Failed to extract {target_title}", company, "extraction")
        return "Not found"

    def extract_address(self, html_content, company):
        """Extract a headquarters address from search result HTML."""
        if self.args.dry_run:
            return f"Address of {company} HQ (dry run)"

        # FIX: see extract_ceo — empty content means the search failed.
        if not html_content:
            return "Not found"

        self.debug_log(f"Attempting to extract headquarters address for {company}", company, "extraction")

        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            # Extract all text-containing elements
            text_elements = soup.find_all(['p', 'span', 'div', 'li'])

            # Create a list of text snippets for pattern matching
            snippets = []
            for element in text_elements:
                text = element.get_text(strip=True)
                if text and len(text) > 10:  # Ignore very short snippets
                    snippets.append(text)

            # Define address pattern matches (street addresses, then cities)
            address_patterns = [
                r"located at\s+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
                r"located at\s+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+)",
                r"located in\s+([A-Za-z\s]+(?:,|\s+in\s+)[A-Za-z\s]+)",
                r"headquarters\s+(?:is|are)\s+(?:in|at)\s+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+)",
                r"headquarters\s+(?:is|are)\s+(?:in|at)\s+([A-Za-z\s]+(?:,|\s+in\s+)[A-Za-z\s]+)",
                r"headquartered\s+(?:in|at)\s+([A-Za-z\s]+(?:,|\s+in\s+)[A-Za-z\s]+)",
                r"based\s+(?:in|at)\s+([A-Za-z\s]+(?:,|\s+in\s+)[A-Za-z\s]+)",
                r"address\s+(?:is|of|:)\s+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+)"
            ]

            # Try each pattern on the snippets
            for snippet in snippets:
                for pattern in address_patterns:
                    self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
                    match = re.search(pattern, snippet, re.IGNORECASE)
                    if match:
                        address = match.group(1).strip()
                        if address:
                            self.debug_log(f"Extracted address from snippet: {address}", company, "extraction")
                            return address

            # If no patterns matched, look for address-related content more broadly
            location_related_texts = []
            for snippet in snippets:
                if any(term in snippet.lower() for term in ["headquarters", "located", "address", "based in"]):
                    location_related_texts.append(snippet)

            if location_related_texts:
                # Look for an address pattern in the location-related content
                address_pattern = r"([0-9]+\s+[A-Za-z\s]+(?:Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+)"
                for text in location_related_texts:
                    match = re.search(address_pattern, text, re.IGNORECASE)
                    if match:
                        address = match.group(1)
                        self.debug_log(f"Extracted address from related text: {address}", company, "extraction")
                        return address

        except Exception as e:
            self.debug_log(f"Error extracting address: {e}", company, "extraction")

        # If all extraction methods fail, return placeholder
        self.debug_log("Failed to extract headquarters address", company, "extraction")
        return "Not found"

    def extract_mailing_address(self, html_content, company):
        """Extract a mailing address (street or PO Box) from search results."""
        if self.args.dry_run:
            return f"Mailing address of {company} (dry run)"

        # FIX: see extract_ceo — empty content means the search failed.
        if not html_content:
            return "Not found"

        self.debug_log(f"Attempting to extract mailing address for {company}", company, "extraction")

        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            # Extract all text-containing elements
            text_elements = soup.find_all(['p', 'span', 'div', 'li'])

            # Create a list of text snippets for pattern matching
            snippets = []
            for element in text_elements:
                text = element.get_text(strip=True)
                if text and len(text) > 10:  # Ignore very short snippets
                    snippets.append(text)

            # Define mailing address pattern matches
            mailing_patterns = [
                r"mailing address[:\s]+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
                r"postal address[:\s]+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
                r"mail to[:\s]+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
                r"P\.?O\.?\s+Box\s+([0-9]+)[,\s]+([A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
                r"([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)"
            ]

            # Try each pattern on the snippets
            for snippet in snippets:
                for pattern in mailing_patterns:
                    self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
                    match = re.search(pattern, snippet, re.IGNORECASE)
                    if match:
                        if pattern.startswith(r"P\.?O\.?"):
                            # Handle PO Box format (two capture groups)
                            po_box = f"PO Box {match.group(1)}"
                            location = match.group(2).strip()
                            address = f"{po_box}, {location}"
                        else:
                            address = match.group(1).strip()

                        if address:
                            self.debug_log(f"Extracted mailing address from snippet: {address}", company, "extraction")
                            return address

        except Exception as e:
            self.debug_log(f"Error extracting mailing address: {e}", company, "extraction")

        # If all extraction methods fail, return placeholder
        self.debug_log("Failed to extract mailing address", company, "extraction")
        return "Not found"

    def extract_phone(self, html_content, company):
        """Extract a phone number from search result HTML."""
        if self.args.dry_run:
            return f"Phone number of {company} (dry run)"

        # FIX: see extract_ceo — empty content means the search failed.
        if not html_content:
            return "Not found"

        self.debug_log(f"Attempting to extract phone number for {company}", company, "extraction")

        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            # Extract all text-containing elements
            text_elements = soup.find_all(['p', 'span', 'div', 'li'])

            # Create a list of text snippets for pattern matching
            snippets = []
            for element in text_elements:
                text = element.get_text(strip=True)
                if text:
                    snippets.append(text)

            # Define phone pattern matches
            phone_patterns = [
                r"phone[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
                r"call[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
                r"telephone[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
                r"tel[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
                r"contact[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
                # NOTE(review): the last pattern and the match loop below were
                # truncated in the original source; reconstructed as a generic
                # standalone number match mirroring the other extractors.
                r"(?<![0-9])(\+?[0-9][\s\-\.\(\)0-9]{8,18}[0-9])(?![0-9])"
            ]

            # Try each pattern on the snippets
            for snippet in snippets:
                for pattern in phone_patterns:
                    self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
                    match = re.search(pattern, snippet, re.IGNORECASE)
                    if match:
                        phone = match.group(1).strip()
                        if phone:
                            self.debug_log(f"Extracted phone from snippet: {phone}", company, "extraction")
                            return phone

        except Exception as e:
            self.debug_log(f"Error extracting phone: {e}", company, "extraction")

        # If all extraction methods fail, return placeholder
        self.debug_log("Failed to extract phone number", company, "extraction")
        return "Not found"

    def extract_email(self, html_content, company):
        """Extract an email address from search result HTML."""
        # NOTE(review): the head of this method was truncated in the original
        # source; the dry-run string was reconstructed to match its siblings.
        if self.args.dry_run:
            return f"Email of {company} (dry run)"

        # FIX: see extract_ceo — empty content means the search failed.
        if not html_content:
            return "Not found"

        self.debug_log(f"Attempting to extract email for {company}", company, "extraction")

        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            # Extract all text-containing elements
            text_elements = soup.find_all(['p', 'span', 'div', 'li', 'a'])

            # Create a list of text snippets for pattern matching
            snippets = []
            for element in text_elements:
                text = element.get_text(strip=True)
                if text:
                    snippets.append(text)

                # Also check for href attributes in <a> tags
                if element.name == 'a' and element.has_attr('href'):
                    href = element['href']
                    if href.startswith('mailto:'):
                        snippets.append(href)

            # Define email pattern matches
            email_patterns = [
                r"email[:\s]+([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
                r"e-mail[:\s]+([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
                r"mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
                r"([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})"  # Generic email pattern
            ]

            # Try each pattern on the snippets
            for snippet in snippets:
                for pattern in email_patterns:
                    self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
                    match = re.search(pattern, snippet, re.IGNORECASE)
                    if match:
                        email = match.group(1).strip().lower()
                        if email:
                            # Basic validation to avoid false positives
                            if '.' in email.split('@')[1] and '@' in email:
                                self.debug_log(f"Extracted email from snippet: {email}", company, "extraction")
                                return email

        except Exception as e:
            self.debug_log(f"Error extracting email: {e}", company, "extraction")

        # If all extraction methods fail, return placeholder
        self.debug_log("Failed to extract email", company, "extraction")
        return "Not found"

    def extract_social(self, html_content, company):
        """Extract social media handles, returned as 'platform: handle; ...'."""
        if self.args.dry_run:
            return f"Social media of {company} (dry run)"

        # FIX: see extract_ceo — empty content means the search failed.
        if not html_content:
            return "Not found"

        self.debug_log(f"Attempting to extract social media profiles for {company}", company, "extraction")

        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            # Extract all text-containing elements and links
            text_elements = soup.find_all(['p', 'span', 'div', 'li'])
            link_elements = soup.find_all('a')

            # Create a list of text snippets and href values for pattern matching
            snippets = []
            for element in text_elements:
                text = element.get_text(strip=True)
                if text:
                    snippets.append(text)
            for link in link_elements:
                if link.has_attr('href'):
                    snippets.append(link['href'])

            # FIX: pair each platform name with its pattern.  The original
            # derived the platform by splitting the regex text on the literal
            # string '\.', which produced garbage labels like '?)?twitter'.
            social_patterns = [
                ("twitter", r"(?:https?://)?(?:www\.)?twitter\.com/([A-Za-z0-9_]+)"),
                ("linkedin", r"(?:https?://)?(?:www\.)?linkedin\.com/(?:company|in)/([A-Za-z0-9_\-]+)"),
                ("facebook", r"(?:https?://)?(?:www\.)?facebook\.com/([A-Za-z0-9\.\-]+)"),
                ("instagram", r"(?:https?://)?(?:www\.)?instagram\.com/([A-Za-z0-9_\.]+)"),
                ("youtube", r"(?:https?://)?(?:www\.)?youtube\.com/(?:channel|user)/([A-Za-z0-9_\-]+)")
            ]

            social_results = []

            # Try each pattern on the snippets
            for snippet in snippets:
                for platform, pattern in social_patterns:
                    self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
                    match = re.search(pattern, snippet, re.IGNORECASE)
                    if match:
                        handle = match.group(1).strip()
                        if handle:
                            social_entry = f"{platform}: {handle}"
                            if social_entry not in social_results:
                                social_results.append(social_entry)
                                self.debug_log(f"Extracted social media: {social_entry}", company, "extraction")

            if social_results:
                return "; ".join(social_results)

        except Exception as e:
            self.debug_log(f"Error extracting social media: {e}", company, "extraction")

        # If no social media profiles found, return placeholder
        self.debug_log("Failed to extract social media profiles", company, "extraction")
        return "Not found"

    def extract_contact(self, html_content, company):
        """Extract combined contact info (phone + email) from one result page."""
        if self.args.dry_run:
            return f"Contact info of {company} (dry run)"

        # FIX: see extract_ceo — empty content means the search failed.
        if not html_content:
            return "Not found"

        # This is a combined extraction function that looks for multiple
        # types of contact information in one search result
        contact_parts = {}

        # Use the specialized extraction methods
        contact_parts["phone"] = self.extract_phone(html_content, company)
        contact_parts["email"] = self.extract_email(html_content, company)

        # Combine the results
        contact_info = []
        for key, value in contact_parts.items():
            if value != "Not found":
                contact_info.append(f"{key}: {value}")

        if contact_info:
            return "; ".join(contact_info)

        return "Not found"

    def debug_log(self, message, company, log_type):
        """Log debug information (per-company log file) if debug is enabled."""
        if self.args.debug:
            clean_company = re.sub(r'[^a-zA-Z0-9_-]', '_', company)
            log_file = os.path.join(Config.DEBUG_DIR, log_type, f"{clean_company}.log")
            with open(log_file, 'a', encoding='utf-8') as f:
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                f.write(f"[{timestamp}] {message}\n")
            if self.args.verbose:
                print(f"DEBUG: {message}")
        elif self.args.verbose:
            print(f"INFO: {message}")

    def process_companies(self):
        """Run every selected search type for every company; fill self.results."""
        total = len(self.companies)

        # Process each company
        for i, company in enumerate(self.companies):
            progress = int((i + 1) * 100 / total)
            print(f"Processing {i+1} of {total} ({progress}%): {company}")

            if not company:
                continue

            # Initialize result dictionary for this company
            company_result = {
                "company": company
            }

            # Process each selected search type
            for search_type in self.search_types:
                search_html = self.search_company(company, search_type)

                # Add a delay between searches (skip after the last type)
                if not self.args.dry_run and search_type != self.search_types[-1]:
                    delay = random.uniform(*Config.DELAY_BETWEEN_SEARCHES)
                    if self.args.verbose:
                        print(f"Waiting {delay:.2f} seconds between searches...")
                    time.sleep(delay)

                # Extract information based on search type
                if search_type == "ceo":
                    company_result["ceo"] = self.extract_ceo(search_html, company)
                elif search_type == "hq":
                    company_result["headquarters"] = self.extract_address(search_html, company)
                elif search_type == "phone":
                    company_result["phone"] = self.extract_phone(search_html, company)
                elif search_type == "email":
                    company_result["email"] = self.extract_email(search_html, company)
                elif search_type == "social":
                    company_result["social_media"] = self.extract_social(search_html, company)
                elif search_type == "contact":
                    company_result["contact_info"] = self.extract_contact(search_html, company)
                elif search_type == "mailing":
                    company_result["mailing_address"] = self.extract_mailing_address(search_html, company)
                elif search_type == "staff":
                    staff_title = self.args.target_staff or "CEO"
                    # Column name is derived from the title, e.g. "chief_of_staff"
                    company_result[f"{staff_title.lower().replace(' ', '_')}"] = self.extract_staff_by_title(search_html, company)

            # Add result to list
            self.results.append(company_result)

            # Add a delay between companies
            if not self.args.dry_run and i < total - 1:
                delay = random.uniform(*Config.DELAY_BETWEEN_COMPANIES)
                if self.args.verbose:
                    print(f"Waiting {delay:.2f} seconds before next company...")
                time.sleep(delay)

        print(f"Completed processing {total} companies.")

    def save_results(self):
        """Save results to CSV, with 'company' first then sorted columns."""
        try:
            # Determine all fields across all results
            all_fields = set()
            for result in self.results:
                all_fields.update(result.keys())

            # Ensure 'company' is the first field
            field_list = sorted(list(all_fields))
            if 'company' in field_list:
                field_list.remove('company')
                field_list = ['company'] + field_list

            with open(self.args.output_file, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(field_list)
                for result in self.results:
                    row = []
                    for field in field_list:
                        row.append(result.get(field, ""))
                    writer.writerow(row)

            print(f"Results saved to {self.args.output_file}")
        except Exception as e:
            print(f"Error saving results: {e}")

    def run(self):
        """Main execution method"""
        print(f"Enhanced Company Information Scraper v{Config.VERSION}")

        self.load_companies()

        if self.args.verbose:
            print(f"Using SearXNG at: {Config.SEARXNG_URL}")
            print(f"Mode: {self.args.mode}")
            if self.args.target_staff:
                print(f"Target staff title: {self.args.target_staff}")
            print(f"Debug mode: {self.args.debug}")
            print(f"Cache: {'enabled' if self.args.use_cache else 'disabled'}")
            print(f"Saving raw HTML: {self.args.save_raw}")

        self.process_companies()
        self.save_results()

        if self.args.save_raw:
            print(f"Raw HTML search results saved to {Config.RAW_DIR}/")


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Enhanced Company Information Scraper with SearXNG')

    parser.add_argument('-i', '--input', dest='input_file',
                        help='Input file with company names (one per line)')
    parser.add_argument('-o', '--output', dest='output_file',
                        default=f"company_data_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv",
                        help='Output CSV file (default: company_data_.csv)')

    # Scraping mode options
    mode_group = parser.add_argument_group('Scraping Mode')
    mode_group.add_argument('-m', '--mode', choices=['minimal', 'targeted', 'comprehensive'],
                            default='minimal',
                            help='Scraping mode: minimal (CEO, HQ only), targeted (specific data), comprehensive (all data)')
    mode_group.add_argument('-T', '--target-staff', dest='target_staff',
                            help='Target specific staff title (e.g., "CTO", "CFO", "Marketing Director")')

    # Include/exclude data types
    data_group = parser.add_argument_group('Data Selection')
    data_group.add_argument('--include-types', dest='include_types',
                            help='Comma-separated list of data types to include (ceo,hq,phone,email,social,contact,mailing,staff)')
    data_group.add_argument('--exclude-types', dest='exclude_types',
                            help='Comma-separated list of data types to exclude')
    data_group.add_argument('--include-contact', dest='include_contact', action='store_true',
                            help='Include contact information (phone, email) in targeted mode')
    data_group.add_argument('--include-address', dest='include_address', action='store_true',
                            help='Include address information (HQ, mailing) in targeted mode')
    data_group.add_argument('--include-social', dest='include_social', action='store_true',
                            help='Include social media information in targeted mode')

    # Cache and performance options
    cache_group = parser.add_argument_group('Cache and Performance')
    cache_group.add_argument('-c', '--no-cache', dest='use_cache', action='store_false', default=True,
                             help='Disable caching of search results')
    cache_group.add_argument('-t', '--timeout', dest='timeout', type=int, default=Config.DEFAULT_TIMEOUT,
                             help=f'Set request timeout in seconds (default: {Config.DEFAULT_TIMEOUT})')

    # Debug and logging options
    debug_group = parser.add_argument_group('Debug and Logging')
    debug_group.add_argument('-D', '--dry-run', dest='dry_run', action='store_true', default=False,
                             help='Show what would be done without executing searches')
    debug_group.add_argument('-d', '--debug', dest='debug', action='store_true', default=False,
                             help='Enable debug mode (saves extraction details)')
    debug_group.add_argument('-r', '--raw', dest='save_raw', action='store_true', default=False,
                             help='Save raw HTML from searches for inspection')
    debug_group.add_argument('-v', '--verbose', dest='verbose', action='store_true', default=False,
                             help='Show verbose output during processing')

    # SearXNG configuration
    searx_group = parser.add_argument_group('SearXNG Configuration')
    searx_group.add_argument('-s', '--searxng-url', dest='searxng_url', default=Config.SEARXNG_URL,
                             help=f'SearXNG instance URL (default: {Config.SEARXNG_URL})')

    args = parser.parse_args()

    # Override the SearXNG URL if provided
    if args.searxng_url != Config.SEARXNG_URL:
        Config.SEARXNG_URL = args.searxng_url

    return args


if __name__ == "__main__":
    args = parse_args()
    scraper = EnhancedCompanyScraper(args)
    scraper.run()