#!/usr/bin/env python3
"""
Enhanced Company Information Scraper with SearXNG Integration
----------------------------
A Python script to collect various company information including executives,
contact details, and addresses using SearXNG as the search backend.
Enhanced features:
- Search for staff by specific titles
- Collect contact information (phone, email, social media)
- Multiple output modes (minimal, targeted, comprehensive)
- Configurable data collection targets
"""
import argparse
import csv
import json
import os
import random
import re
import sys
import time
from datetime import datetime
from urllib.parse import quote_plus, urlencode
try:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
except ImportError:
print("Required packages not found. Please install them with:")
print("pip install requests beautifulsoup4 fake-useragent")
sys.exit(1)
# Configuration
class Config:
VERSION = "2.0.0"
DEFAULT_TIMEOUT = 20
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".cache")
DEBUG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".debug")
RAW_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "raw_scrapes")
# SearXNG configuration
SEARXNG_URL = "http://localhost:8888/"
# Search engines to use with SearXNG
SEARCH_ENGINES = [
"google",
"duckduckgo",
"bing"
]
# Search delay ranges (min, max) in seconds
DELAY_BETWEEN_SEARCHES = (1, 3) # Can be lower with SearXNG
DELAY_BETWEEN_COMPANIES = (2, 5) # Can be lower with SearXNG
DELAY_BEFORE_SEARCH = (0.5, 1.5) # Can be lower with SearXNG
# Retry configuration
MAX_RETRIES = 3
RETRY_DELAY = (2, 5) # Can be lower with SearXNG
# Available search types
SEARCH_TYPES = {
"ceo": "CEO information",
"hq": "Headquarters address",
"phone": "Phone numbers",
"email": "Email addresses",
"social": "Social media profiles",
"staff": "Staff members by title",
"contact": "General contact information",
"mailing": "Mailing address"
}
# Minimal mode search types
MINIMAL_SEARCH_TYPES = ["ceo", "hq"]
# Default comprehensive search types (everything)
COMPREHENSIVE_SEARCH_TYPES = list(SEARCH_TYPES.keys())
class EnhancedCompanyScraper:
def __init__(self, args):
self.args = args
self.companies = []
self.results = []
self.session = requests.Session()
# Determine which search types to use based on mode
self.search_types = self.determine_search_types()
self.setup_directories()
# Check if SearXNG is running
if not self.check_searxng():
print(f"Error: SearXNG not available at {Config.SEARXNG_URL}")
print("Please make sure SearXNG is running before using this script.")
print("You can start it with: docker-compose up -d")
sys.exit(1)
# Use fake-useragent to rotate user agents
try:
self.ua = UserAgent()
except:
# Fallback if fake-useragent fails
self.ua = None
print("Warning: fake-useragent failed to initialize. Using default user agent.")
def determine_search_types(self):
"""Determine which search types to use based on mode and args"""
search_types = []
# Start with default search types
if self.args.mode == "minimal":
search_types = Config.MINIMAL_SEARCH_TYPES.copy()
elif self.args.mode == "comprehensive":
search_types = Config.COMPREHENSIVE_SEARCH_TYPES.copy()
elif self.args.mode == "targeted":
# For targeted mode, use only what was specified
if self.args.target_staff:
search_types.append("staff")
else:
# If no staff title specified, default to CEO
search_types.append("ceo")
# Add any explicitly requested types
if self.args.include_contact:
search_types.extend(["phone", "email"])
if self.args.include_address:
search_types.extend(["hq", "mailing"])
if self.args.include_social:
search_types.append("social")
# If nothing explicitly included, add headquarters
if len(search_types) == 1: # Only staff/ceo
search_types.append("hq")
# Override with explicit includes/excludes
if self.args.include_types:
for type_name in self.args.include_types.split(','):
type_name = type_name.strip()
if type_name in Config.SEARCH_TYPES and type_name not in search_types:
search_types.append(type_name)
if self.args.exclude_types:
for type_name in self.args.exclude_types.split(','):
type_name = type_name.strip()
if type_name in search_types:
search_types.remove(type_name)
# Log selected search types
if self.args.verbose:
print(f"Selected search types: {', '.join(search_types)}")
return search_types
def check_searxng(self):
"""Check if SearXNG is running and available"""
if self.args.dry_run:
return True
try:
response = requests.get(Config.SEARXNG_URL, timeout=5)
return response.status_code == 200
except:
return False
def setup_directories(self):
"""Create necessary directories for caching and debugging"""
# Create cache directories for all search types
if self.args.use_cache:
for search_type in Config.SEARCH_TYPES.keys():
os.makedirs(os.path.join(Config.CACHE_DIR, search_type), exist_ok=True)
if self.args.debug:
os.makedirs(Config.DEBUG_DIR, exist_ok=True)
os.makedirs(os.path.join(Config.DEBUG_DIR, "extraction"), exist_ok=True)
os.makedirs(os.path.join(Config.DEBUG_DIR, "patterns"), exist_ok=True)
if self.args.save_raw:
for search_type in Config.SEARCH_TYPES.keys():
os.makedirs(os.path.join(Config.RAW_DIR, search_type), exist_ok=True)
def load_companies(self):
"""Load companies from file or stdin"""
if self.args.input_file:
try:
with open(self.args.input_file, 'r') as f:
for line in f:
company = line.strip()
if company:
self.companies.append(company)
except Exception as e:
print(f"Error loading companies from file: {e}")
sys.exit(1)
else:
print("Enter company names (one per line), press Ctrl+D when finished:")
for line in sys.stdin:
company = line.strip()
if company:
self.companies.append(company)
if not self.companies:
print("No companies provided!")
sys.exit(1)
print(f"Loaded {len(self.companies)} companies")
def get_random_user_agent(self):
"""Get a random user agent"""
if self.ua:
return self.ua.random
return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
def get_searxng_url(self, query, search_type, engine):
"""Get SearXNG search URL for the given engine and search type"""
query = quote_plus(query)
search_terms = ""
if search_type == "ceo":
search_terms = "CEO who is the chief executive"
elif search_type == "hq":
search_terms = "headquarters address location where is"
elif search_type == "phone":
search_terms = "phone number contact"
elif search_type == "email":
search_terms = "email address contact"
elif search_type == "social":
search_terms = "social media profiles twitter linkedin facebook"
elif search_type == "contact":
search_terms = "contact information phone email"
elif search_type == "mailing":
search_terms = "mailing address postal"
elif search_type == "staff":
# For staff, include the target title in the search
staff_title = self.args.target_staff or "executive team"
search_terms = f"{staff_title} who is"
# Build the full query
full_query = f"{query} {search_terms}"
# Prepare parameters for SearXNG
params = {
'q': full_query,
'engines': engine,
'format': 'html',
'language': 'en-US'
}
# Build the URL
url = f"{Config.SEARXNG_URL.rstrip('/')}/?{urlencode(params)}"
return url
def search_company(self, company, search_type):
"""Search for company information with specific search type"""
clean_company = re.sub(r'[^a-zA-Z0-9_-]', '+', company)
cache_file = os.path.join(Config.CACHE_DIR, search_type, f"{clean_company}.html")
# Check cache first if enabled
if self.args.use_cache and os.path.exists(cache_file):
self.debug_log(f"Using cached data for {search_type} search", company, "extraction")
with open(cache_file, 'r', encoding='utf-8') as f:
return f.read()
# Try each search engine until one succeeds
for retry in range(Config.MAX_RETRIES):
for engine in Config.SEARCH_ENGINES:
if self.args.verbose:
print(f"Searching for {company} {search_type} using SearXNG with {engine} (attempt {retry+1})")
# Random delay before search
delay = random.uniform(*Config.DELAY_BEFORE_SEARCH)
if self.args.verbose:
print(f"Waiting {delay:.2f} seconds before search...")
time.sleep(delay)
# Get the search URL
url = self.get_searxng_url(company, search_type, engine)
if self.args.dry_run:
self.debug_log(f"Would search: {url}", company, "extraction")
return ""
# Prepare headers with random user agent
headers = {
"User-Agent": self.get_random_user_agent(),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1"
}
try:
response = self.session.get(
url,
headers=headers,
timeout=self.args.timeout
)
# Check if the response is valid
if response.status_code != 200:
if self.args.verbose:
print(f"Got status code {response.status_code} from SearXNG with {engine}")
continue
# Get the HTML content
html_content = response.text
# Save raw HTML if requested
if self.args.save_raw:
raw_file = os.path.join(Config.RAW_DIR, search_type, f"{clean_company}_{engine}.html")
with open(raw_file, 'w', encoding='utf-8') as f:
f.write(html_content)
# Save to cache if enabled
if self.args.use_cache:
with open(cache_file, 'w', encoding='utf-8') as f:
f.write(html_content)
return html_content
except Exception as e:
if self.args.verbose:
print(f"Error searching with SearXNG/{engine}: {e}")
continue
# If we've tried all engines and none worked, wait before retry
if retry < Config.MAX_RETRIES - 1:
retry_delay = random.uniform(*Config.RETRY_DELAY)
if self.args.verbose:
print(f"All search engines failed. Waiting {retry_delay:.2f} seconds before retry...")
time.sleep(retry_delay)
# If all retries failed
print(f"Warning: All search attempts failed for {company} {search_type}")
return ""
def extract_ceo(self, html_content, company):
"""Extract CEO name from search results"""
if self.args.dry_run:
return f"CEO of {company} (dry run)"
if "" in html_content:
return "Not found"
self.debug_log(f"Attempting to extract CEO for {company}", company, "extraction")
# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')
# Method 1: Look for structured data
try:
# Extract all text-containing elements
text_elements = soup.find_all(['p', 'span', 'div', 'li'])
# Create a list of text snippets for pattern matching
snippets = []
for element in text_elements:
text = element.get_text(strip=True)
if text and len(text) > 10: # Ignore very short snippets
snippets.append(text)
# Define CEO pattern matches
ceo_patterns = [
r"CEO\s+(is|of)\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+is\s+(?:the\s+)?(?:current\s+)?(?:CEO|Chief Executive Officer)",
r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+has been\s+(?:the\s+)?(?:CEO|Chief Executive Officer)",
r"led by\s+(?:CEO|Chief Executive Officer)\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
r"led by\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?),\s+(?:the\s+)?(?:CEO|Chief Executive Officer)",
r"(?:CEO|Chief Executive Officer)[,]?\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+serves as\s+(?:the\s+)?(?:CEO|Chief Executive Officer)",
r"current\s+(?:CEO|Chief Executive Officer)\s+(?:is\s+)?([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)"
]
# Try each pattern on the snippets
for snippet in snippets:
for pattern in ceo_patterns:
self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
match = re.search(pattern, snippet, re.IGNORECASE)
if match:
# Determine which group contains the CEO name based on pattern
if pattern.startswith(r"CEO"):
ceo = match.group(2)
else:
ceo = match.group(1)
if ceo:
self.debug_log(f"Extracted CEO from snippet: {ceo}", company, "extraction")
return ceo
# If no patterns matched, look for CEO-related content more broadly
ceo_related_texts = []
for snippet in snippets:
if "ceo" in snippet.lower() or "chief executive" in snippet.lower():
ceo_related_texts.append(snippet)
if ceo_related_texts:
# Look for a name pattern in the CEO-related content
name_pattern = r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)"
for text in ceo_related_texts:
match = re.search(name_pattern, text)
if match:
ceo = match.group(1)
self.debug_log(f"Extracted CEO from related text: {ceo}", company, "extraction")
return ceo
except Exception as e:
self.debug_log(f"Error extracting CEO: {e}", company, "extraction")
# If all extraction methods fail, return placeholder
self.debug_log("Failed to extract CEO", company, "extraction")
return "Not found"
def extract_staff_by_title(self, html_content, company):
"""Extract staff member by title from search results"""
if self.args.dry_run:
return f"Staff member ({self.args.target_staff}) of {company} (dry run)"
if "" in html_content:
return "Not found"
target_title = self.args.target_staff
if not target_title:
return "No title specified"
self.debug_log(f"Attempting to extract {target_title} for {company}", company, "extraction")
# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')
try:
# Extract all text-containing elements
text_elements = soup.find_all(['p', 'span', 'div', 'li'])
# Create a list of text snippets for pattern matching
snippets = []
for element in text_elements:
text = element.get_text(strip=True)
if text and len(text) > 10: # Ignore very short snippets
snippets.append(text)
# Create patterns for the specified title
# Normalize the title for pattern matching
normalized_title = target_title.lower().replace(' ', '\\s+')
# Define staff pattern matches
staff_patterns = [
rf"{normalized_title}\s+(is|of)\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
rf"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+is\s+(?:the\s+)?(?:current\s+)?(?:{normalized_title})",
rf"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+has been\s+(?:the\s+)?(?:{normalized_title})",
rf"led by\s+(?:{normalized_title})\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
rf"led by\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?),\s+(?:the\s+)?(?:{normalized_title})",
rf"(?:{normalized_title})[,]?\s+([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)",
rf"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)\s+serves as\s+(?:the\s+)?(?:{normalized_title})",
rf"current\s+(?:{normalized_title})\s+(?:is\s+)?([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)"
]
# Try each pattern on the snippets
for snippet in snippets:
for pattern in staff_patterns:
self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
match = re.search(pattern, snippet, re.IGNORECASE)
if match:
# Extract the name based on the pattern
if len(match.groups()) > 1 and pattern.startswith(rf"{normalized_title}"):
staff_name = match.group(2)
else:
staff_name = match.group(1)
if staff_name:
self.debug_log(f"Extracted {target_title} from snippet: {staff_name}", company, "extraction")
return staff_name
# If no patterns matched, look for title-related content more broadly
title_related_texts = []
for snippet in snippets:
if target_title.lower() in snippet.lower():
title_related_texts.append(snippet)
if title_related_texts:
# Look for a name pattern in the title-related content
name_pattern = r"([A-Z][a-z]+\s+[A-Z][a-z]+(?:[ -][A-Z][a-z]+)?)"
for text in title_related_texts:
match = re.search(name_pattern, text)
if match:
staff_name = match.group(1)
self.debug_log(f"Extracted {target_title} from related text: {staff_name}", company, "extraction")
return staff_name
except Exception as e:
self.debug_log(f"Error extracting {target_title}: {e}", company, "extraction")
# If all extraction methods fail, return placeholder
self.debug_log(f"Failed to extract {target_title}", company, "extraction")
return "Not found"
def extract_address(self, html_content, company):
"""Extract headquarters address from search results"""
if self.args.dry_run:
return f"Address of {company} HQ (dry run)"
if "" in html_content:
return "Not found"
self.debug_log(f"Attempting to extract headquarters address for {company}", company, "extraction")
# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')
try:
# Extract all text-containing elements
text_elements = soup.find_all(['p', 'span', 'div', 'li'])
# Create a list of text snippets for pattern matching
snippets = []
for element in text_elements:
text = element.get_text(strip=True)
if text and len(text) > 10: # Ignore very short snippets
snippets.append(text)
# Define address pattern matches
address_patterns = [
r"located at\s+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
r"located at\s+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+)",
r"located in\s+([A-Za-z\s]+(?:,|\s+in\s+)[A-Za-z\s]+)",
r"headquarters\s+(?:is|are)\s+(?:in|at)\s+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+)",
r"headquarters\s+(?:is|are)\s+(?:in|at)\s+([A-Za-z\s]+(?:,|\s+in\s+)[A-Za-z\s]+)",
r"headquartered\s+(?:in|at)\s+([A-Za-z\s]+(?:,|\s+in\s+)[A-Za-z\s]+)",
r"based\s+(?:in|at)\s+([A-Za-z\s]+(?:,|\s+in\s+)[A-Za-z\s]+)",
r"address\s+(?:is|of|:)\s+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+)"
]
# Try each pattern on the snippets
for snippet in snippets:
for pattern in address_patterns:
self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
match = re.search(pattern, snippet, re.IGNORECASE)
if match:
address = match.group(1).strip()
if address:
self.debug_log(f"Extracted address from snippet: {address}", company, "extraction")
return address
# If no patterns matched, look for address-related content more broadly
location_related_texts = []
for snippet in snippets:
if any(term in snippet.lower() for term in ["headquarters", "located", "address", "based in"]):
location_related_texts.append(snippet)
if location_related_texts:
# Look for an address pattern in the location-related content
address_pattern = r"([0-9]+\s+[A-Za-z\s]+(?:Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+)"
for text in location_related_texts:
match = re.search(address_pattern, text, re.IGNORECASE)
if match:
address = match.group(1)
self.debug_log(f"Extracted address from related text: {address}", company, "extraction")
return address
except Exception as e:
self.debug_log(f"Error extracting address: {e}", company, "extraction")
# If all extraction methods fail, return placeholder
self.debug_log("Failed to extract headquarters address", company, "extraction")
return "Not found"
def extract_mailing_address(self, html_content, company):
"""Extract mailing address from search results"""
if self.args.dry_run:
return f"Mailing address of {company} (dry run)"
if "" in html_content:
return "Not found"
self.debug_log(f"Attempting to extract mailing address for {company}", company, "extraction")
# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')
try:
# Extract all text-containing elements
text_elements = soup.find_all(['p', 'span', 'div', 'li'])
# Create a list of text snippets for pattern matching
snippets = []
for element in text_elements:
text = element.get_text(strip=True)
if text and len(text) > 10: # Ignore very short snippets
snippets.append(text)
# Define mailing address pattern matches
mailing_patterns = [
r"mailing address[:\s]+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
r"postal address[:\s]+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
r"mail to[:\s]+([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
r"P\.?O\.?\s+Box\s+([0-9]+)[,\s]+([A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)",
r"([0-9]+\s+[A-Za-z\s]+(Road|Street|Avenue|Ave|St|Blvd|Boulevard|Pkwy|Parkway)[,\s]+[A-Za-z\s]+[,\s]+[A-Za-z]+\s+[0-9-]+)"
]
# Try each pattern on the snippets
for snippet in snippets:
for pattern in mailing_patterns:
self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
match = re.search(pattern, snippet, re.IGNORECASE)
if match:
if pattern.startswith(r"P\.?O\.?"):
# Handle PO Box format
po_box = f"PO Box {match.group(1)}"
location = match.group(2).strip()
address = f"{po_box}, {location}"
else:
address = match.group(1).strip()
if address:
self.debug_log(f"Extracted mailing address from snippet: {address}", company, "extraction")
return address
except Exception as e:
self.debug_log(f"Error extracting mailing address: {e}", company, "extraction")
# If all extraction methods fail, return placeholder
self.debug_log("Failed to extract mailing address", company, "extraction")
return "Not found"
def extract_phone(self, html_content, company):
"""Extract phone number from search results"""
if self.args.dry_run:
return f"Phone number of {company} (dry run)"
if "" in html_content:
return "Not found"
self.debug_log(f"Attempting to extract phone number for {company}", company, "extraction")
# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')
try:
# Extract all text-containing elements
text_elements = soup.find_all(['p', 'span', 'div', 'li'])
# Create a list of text snippets for pattern matching
snippets = []
for element in text_elements:
text = element.get_text(strip=True)
if text:
snippets.append(text)
# Define phone pattern matches
phone_patterns = [
r"phone[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
r"call[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
r"telephone[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
r"tel[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
r"contact[:\s]+(\+?[0-9][\s\-\.\(\)0-9]{8,20})",
r"(?" in html_content:
return "Not found"
self.debug_log(f"Attempting to extract email for {company}", company, "extraction")
# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')
try:
# Extract all text-containing elements
text_elements = soup.find_all(['p', 'span', 'div', 'li', 'a'])
# Create a list of text snippets for pattern matching
snippets = []
for element in text_elements:
text = element.get_text(strip=True)
if text:
snippets.append(text)
# Also check for href attributes in tags
if element.name == 'a' and element.has_attr('href'):
href = element['href']
if href.startswith('mailto:'):
snippets.append(href)
# Define email pattern matches
email_patterns = [
r"email[:\s]+([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
r"e-mail[:\s]+([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
r"mailto:([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
r"([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})" # Generic email pattern
]
# Try each pattern on the snippets
for snippet in snippets:
for pattern in email_patterns:
self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
match = re.search(pattern, snippet, re.IGNORECASE)
if match:
email = match.group(1).strip().lower()
if email:
# Basic validation to avoid false positives
if '.' in email.split('@')[1] and '@' in email:
self.debug_log(f"Extracted email from snippet: {email}", company, "extraction")
return email
except Exception as e:
self.debug_log(f"Error extracting email: {e}", company, "extraction")
# If all extraction methods fail, return placeholder
self.debug_log("Failed to extract email", company, "extraction")
return "Not found"
def extract_social(self, html_content, company):
"""Extract social media profiles from search results"""
if self.args.dry_run:
return f"Social media of {company} (dry run)"
if "" in html_content:
return "Not found"
self.debug_log(f"Attempting to extract social media profiles for {company}", company, "extraction")
# Parse HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')
try:
# Extract all text-containing elements and links
text_elements = soup.find_all(['p', 'span', 'div', 'li'])
link_elements = soup.find_all('a')
# Create a list of text snippets and href values for pattern matching
snippets = []
for element in text_elements:
text = element.get_text(strip=True)
if text:
snippets.append(text)
for link in link_elements:
if link.has_attr('href'):
snippets.append(link['href'])
# Define social media pattern matches
social_patterns = [
r"(?:https?://)?(?:www\.)?twitter\.com/([A-Za-z0-9_]+)",
r"(?:https?://)?(?:www\.)?linkedin\.com/(?:company|in)/([A-Za-z0-9_\-]+)",
r"(?:https?://)?(?:www\.)?facebook\.com/([A-Za-z0-9\.\-]+)",
r"(?:https?://)?(?:www\.)?instagram\.com/([A-Za-z0-9_\.]+)",
r"(?:https?://)?(?:www\.)?youtube\.com/(?:channel|user)/([A-Za-z0-9_\-]+)"
]
social_results = []
# Try each pattern on the snippets
for snippet in snippets:
for pattern in social_patterns:
self.debug_log(f"Checking snippet with pattern: {pattern}", company, "patterns")
match = re.search(pattern, snippet, re.IGNORECASE)
if match:
handle = match.group(1).strip()
platform = pattern.split(r'\.')[1].split(r'\.')[0] # Extract platform name from pattern
if handle:
social_entry = f"{platform}: {handle}"
if social_entry not in social_results:
social_results.append(social_entry)
self.debug_log(f"Extracted social media: {social_entry}", company, "extraction")
if social_results:
return "; ".join(social_results)
except Exception as e:
self.debug_log(f"Error extracting social media: {e}", company, "extraction")
# If no social media profiles found, return placeholder
self.debug_log("Failed to extract social media profiles", company, "extraction")
return "Not found"
def extract_contact(self, html_content, company):
"""Extract general contact information from search results"""
if self.args.dry_run:
return f"Contact info of {company} (dry run)"
if "" in html_content:
return "Not found"
# This is a combined extraction function that looks for multiple
# types of contact information in one search result
contact_parts = {}
# Use the specialized extraction methods
contact_parts["phone"] = self.extract_phone(html_content, company)
contact_parts["email"] = self.extract_email(html_content, company)
# Combine the results
contact_info = []
for key, value in contact_parts.items():
if value != "Not found":
contact_info.append(f"{key}: {value}")
if contact_info:
return "; ".join(contact_info)
return "Not found"
def debug_log(self, message, company, log_type):
"""Log debug information if debug mode is enabled"""
if self.args.debug:
clean_company = re.sub(r'[^a-zA-Z0-9_-]', '_', company)
log_file = os.path.join(Config.DEBUG_DIR, log_type, f"{clean_company}.log")
with open(log_file, 'a', encoding='utf-8') as f:
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
f.write(f"[{timestamp}] {message}\n")
if self.args.verbose:
print(f"DEBUG: {message}")
elif self.args.verbose:
print(f"INFO: {message}")
def process_companies(self):
"""Process the list of companies and create CSV output"""
total = len(self.companies)
# Process each company
for i, company in enumerate(self.companies):
progress = int((i + 1) * 100 / total)
print(f"Processing {i+1} of {total} ({progress}%): {company}")
if not company:
continue
# Initialize result dictionary for this company
company_result = {
"company": company
}
# Process each selected search type
for search_type in self.search_types:
search_html = self.search_company(company, search_type)
# Add a delay between searches
if not self.args.dry_run and search_type != self.search_types[-1]:
delay = random.uniform(*Config.DELAY_BETWEEN_SEARCHES)
if self.args.verbose:
print(f"Waiting {delay:.2f} seconds between searches...")
time.sleep(delay)
# Extract information based on search type
if search_type == "ceo":
company_result["ceo"] = self.extract_ceo(search_html, company)
elif search_type == "hq":
company_result["headquarters"] = self.extract_address(search_html, company)
elif search_type == "phone":
company_result["phone"] = self.extract_phone(search_html, company)
elif search_type == "email":
company_result["email"] = self.extract_email(search_html, company)
elif search_type == "social":
company_result["social_media"] = self.extract_social(search_html, company)
elif search_type == "contact":
company_result["contact_info"] = self.extract_contact(search_html, company)
elif search_type == "mailing":
company_result["mailing_address"] = self.extract_mailing_address(search_html, company)
elif search_type == "staff":
staff_title = self.args.target_staff or "CEO"
company_result[f"{staff_title.lower().replace(' ', '_')}"] = self.extract_staff_by_title(search_html, company)
# Add result to list
self.results.append(company_result)
# Add a delay between companies
if not self.args.dry_run and i < total - 1:
delay = random.uniform(*Config.DELAY_BETWEEN_COMPANIES)
if self.args.verbose:
print(f"Waiting {delay:.2f} seconds before next company...")
time.sleep(delay)
print(f"Completed processing {total} companies.")
def save_results(self):
"""Save results to CSV file"""
try:
# Determine all fields across all results
all_fields = set()
for result in self.results:
all_fields.update(result.keys())
# Ensure 'company' is the first field
field_list = sorted(list(all_fields))
if 'company' in field_list:
field_list.remove('company')
field_list = ['company'] + field_list
with open(self.args.output_file, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(field_list)
for result in self.results:
row = []
for field in field_list:
row.append(result.get(field, ""))
writer.writerow(row)
print(f"Results saved to {self.args.output_file}")
except Exception as e:
print(f"Error saving results: {e}")
def run(self):
"""Main execution method"""
print(f"Enhanced Company Information Scraper v{Config.VERSION}")
self.load_companies()
if self.args.verbose:
print(f"Using SearXNG at: {Config.SEARXNG_URL}")
print(f"Mode: {self.args.mode}")
if self.args.target_staff:
print(f"Target staff title: {self.args.target_staff}")
print(f"Debug mode: {self.args.debug}")
print(f"Cache: {'enabled' if self.args.use_cache else 'disabled'}")
print(f"Saving raw HTML: {self.args.save_raw}")
self.process_companies()
self.save_results()
if self.args.save_raw:
print(f"Raw HTML search results saved to {Config.RAW_DIR}/")
def parse_args():
"""Parse command line arguments"""
parser = argparse.ArgumentParser(description='Enhanced Company Information Scraper with SearXNG')
parser.add_argument('-i', '--input', dest='input_file',
help='Input file with company names (one per line)')
parser.add_argument('-o', '--output', dest='output_file',
default=f"company_data_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv",
help='Output CSV file (default: company_data_.csv)')
# Scraping mode options
mode_group = parser.add_argument_group('Scraping Mode')
mode_group.add_argument('-m', '--mode', choices=['minimal', 'targeted', 'comprehensive'],
default='minimal',
help='Scraping mode: minimal (CEO, HQ only), targeted (specific data), comprehensive (all data)')
mode_group.add_argument('-T', '--target-staff', dest='target_staff',
help='Target specific staff title (e.g., "CTO", "CFO", "Marketing Director")')
# Include/exclude data types
data_group = parser.add_argument_group('Data Selection')
data_group.add_argument('--include-types', dest='include_types',
help='Comma-separated list of data types to include (ceo,hq,phone,email,social,contact,mailing,staff)')
data_group.add_argument('--exclude-types', dest='exclude_types',
help='Comma-separated list of data types to exclude')
data_group.add_argument('--include-contact', dest='include_contact', action='store_true',
help='Include contact information (phone, email) in targeted mode')
data_group.add_argument('--include-address', dest='include_address', action='store_true',
help='Include address information (HQ, mailing) in targeted mode')
data_group.add_argument('--include-social', dest='include_social', action='store_true',
help='Include social media information in targeted mode')
# Cache and performance options
cache_group = parser.add_argument_group('Cache and Performance')
cache_group.add_argument('-c', '--no-cache', dest='use_cache',
action='store_false', default=True,
help='Disable caching of search results')
cache_group.add_argument('-t', '--timeout', dest='timeout',
type=int, default=Config.DEFAULT_TIMEOUT,
help=f'Set request timeout in seconds (default: {Config.DEFAULT_TIMEOUT})')
# Debug and logging options
debug_group = parser.add_argument_group('Debug and Logging')
debug_group.add_argument('-D', '--dry-run', dest='dry_run',
action='store_true', default=False,
help='Show what would be done without executing searches')
debug_group.add_argument('-d', '--debug', dest='debug',
action='store_true', default=False,
help='Enable debug mode (saves extraction details)')
debug_group.add_argument('-r', '--raw', dest='save_raw',
action='store_true', default=False,
help='Save raw HTML from searches for inspection')
debug_group.add_argument('-v', '--verbose', dest='verbose',
action='store_true', default=False,
help='Show verbose output during processing')
# SearXNG configuration
searx_group = parser.add_argument_group('SearXNG Configuration')
searx_group.add_argument('-s', '--searxng-url', dest='searxng_url',
default=Config.SEARXNG_URL,
help=f'SearXNG instance URL (default: {Config.SEARXNG_URL})')
args = parser.parse_args()
# Override the SearXNG URL if provided
if args.searxng_url != Config.SEARXNG_URL:
Config.SEARXNG_URL = args.searxng_url
return args
if __name__ == "__main__":
args = parse_args()
scraper = EnhancedCompanyScraper(args)
scraper.run()