Web Scraping with BeautifulSoup: A Complete Guide

Web scraping is a powerful technique for extracting data from websites. In this guide, we’ll explore how to use BeautifulSoup to scrape web data effectively.

Getting Started with BeautifulSoup

First, let’s set up our environment and import the necessary libraries:

# @filename: utils.py
import logging

import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Optional

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

Basic Web Scraping

Making HTTP Requests

# @filename: utils.py
def fetch_page(url: str) -> Optional[str]:
    """Fetch webpage content with error handling."""
    try:
        # Add headers to mimic browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

# Example usage
url = 'https://example.com'
html_content = fetch_page(url)
if html_content:
    soup = BeautifulSoup(html_content, 'html.parser')

Parsing HTML

# @filename: main.py
# Find elements by tag
paragraphs = soup.find_all('p')
links = soup.find_all('a')

# Find elements by class
elements = soup.find_all(class_='my-class')

# Find elements by ID
element = soup.find(id='my-id')

# Find elements by CSS selector
elements = soup.select('.class-name')
elements = soup.select('#id-name')
elements = soup.select('div.class-name')

Advanced Scraping Techniques

# @filename: main.py
# Navigate through elements
parent = element.parent
children = element.children
siblings = element.next_siblings

# Find nested elements
nested = soup.find('div').find('p').find('span')

# Search with multiple conditions
elements = soup.find_all(['p', 'div'], class_='content')

# Regular expression search (returns matching text nodes, not tags)
import re

text_nodes = soup.find_all(string=re.compile(r'pattern'))  # 'string' is the modern name for the older 'text' argument
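
For a quick sanity check, here is a self-contained snippet (using a small inline HTML string rather than a real page) showing how these navigation and search calls behave:

# @filename: main.py
import re
from bs4 import BeautifulSoup

html = '<div class="content"><p>First <span>item</span></p><p>Second item</p></div>'
demo = BeautifulSoup(html, 'html.parser')

span = demo.find('div').find('p').find('span')
print(span.parent.name)                                     # p
print([c.name for c in span.parent.parent.children])        # ['p', 'p']
print(len(demo.find_all(['p', 'div'], class_='content')))   # 1 (only the div has the class)
print(demo.find_all(string=re.compile(r'item')))            # text nodes containing 'item'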

Extracting Data

# @filename: main.py
# Get text content
text = element.text.strip()

# Get attributes
href = element.get('href')
src = element['src']

# Get all text recursively
all_text = soup.get_text(separator=' ', strip=True)
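
To tie the fetching and extraction steps together, here is a minimal sketch; the URL is a placeholder and fetch_page is the helper defined in utils.py above (assuming main.py sits next to utils.py):

# @filename: main.py
from bs4 import BeautifulSoup
from utils import fetch_page

html_content = fetch_page('https://example.com')
if html_content:
    soup = BeautifulSoup(html_content, 'html.parser')
    # Print the text and target of every link that has an href
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            print(f"{link.text.strip()}: {href}")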

Project: E-commerce Product Scraper

Let’s build a complete scraper for e-commerce products:

# @filename: product_scraper.py
import json
import logging
import os
import random
import time
from datetime import datetime
from typing import List, Dict, Optional
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

class ProductScraper:
    def __init__(self, base_url: str, output_dir: str = 'data'):
        """Initialize the ProductScraper."""
        self.base_url = base_url
        self.output_dir = output_dir
        self.session = requests.Session()
        self.products: List[Dict] = []

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Configure logging
        self._setup_logging()

    def _setup_logging(self):
        """Set up logging configuration."""
        log_file = os.path.join(self.output_dir, 'scraper.log')
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def _make_request(self, url: str) -> Optional[str]:
        """Make HTTP request with retry mechanism."""
        max_retries = 3
        retry_delay = 1

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }

        for attempt in range(max_retries):
            try:
                response = self.session.get(url, headers=headers, timeout=10)
                response.raise_for_status()

                # Add delay between requests
                time.sleep(random.uniform(1, 3))

                return response.text
            except requests.RequestException as e:
                self.logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay * (attempt + 1))
                continue

        return None

    def _parse_product_page(self, url: str, html: str) -> Optional[Dict]:
        """Parse product details from a product page."""
        try:
            soup = BeautifulSoup(html, 'html.parser')

            # Extract product details (customize based on website structure)
            product = {
                'url': url,
                'name': self._extract_text(soup, '.product-name'),
                'price': self._extract_price(soup, '.product-price'),
                'description': self._extract_text(soup, '.product-description'),
                'specifications': self._extract_specifications(soup),
                'images': self._extract_images(soup),
                'scraped_at': datetime.now().isoformat()
            }

            return product
        except Exception as e:
            self.logger.error(f"Error parsing product page {url}: {e}")
            return None

    def _extract_text(self, soup: BeautifulSoup, selector: str) -> str:
        """Extract text from an element."""
        element = soup.select_one(selector)
        return element.text.strip() if element else ''

    def _extract_price(self, soup: BeautifulSoup, selector: str) -> Optional[float]:
        """Extract and parse price."""
        price_text = self._extract_text(soup, selector)
        try:
            # Keep digits and the decimal point so "$19.99" parses as 19.99
            cleaned = ''.join(c for c in price_text if c.isdigit() or c == '.')
            return float(cleaned)
        except ValueError:
            return None

    def _extract_specifications(self, soup: BeautifulSoup) -> Dict:
        """Extract product specifications."""
        specs = {}
        # Customize based on website structure
        spec_table = soup.select_one('.specifications-table')
        if spec_table:
            for row in spec_table.select('tr'):
                cols = row.select('td')
                if len(cols) >= 2:
                    key = cols[0].text.strip()
                    value = cols[1].text.strip()
                    specs[key] = value
        return specs

    def _extract_images(self, soup: BeautifulSoup) -> List[str]:
        """Extract product images."""
        images = []
        for img in soup.select('.product-images img'):
            src = img.get('src') or img.get('data-src')
            if src:
                images.append(urljoin(self.base_url, src))
        return images

    def scrape_product_links(self, category_url: str) -> List[str]:
        """Scrape product links from category page."""
        links = []
        html = self._make_request(category_url)

        if html:
            soup = BeautifulSoup(html, 'html.parser')
            # Customize selector based on website structure
            for link in soup.select('.product-link'):
                href = link.get('href')
                if href:
                    full_url = urljoin(self.base_url, href)
                    links.append(full_url)

        return links

    def scrape_products(self, category_urls: List[str]):
        """Scrape products from multiple categories."""
        for category_url in category_urls:
            self.logger.info(f"Scraping category: {category_url}")

            # Get product links
            product_links = self.scrape_product_links(category_url)
            self.logger.info(f"Found {len(product_links)} products")

            # Scrape each product
            for link in product_links:
                self.logger.info(f"Scraping product: {link}")
                html = self._make_request(link)

                if html:
                    product = self._parse_product_page(link, html)
                    if product:
                        self.products.append(product)

    def save_results(self):
        """Save scraped data to files."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Save to CSV
        csv_file = os.path.join(self.output_dir, f'products_{timestamp}.csv')
        df = pd.DataFrame(self.products)
        df.to_csv(csv_file, index=False)
        self.logger.info(f"Saved results to {csv_file}")

        # Save to JSON
        json_file = os.path.join(self.output_dir, f'products_{timestamp}.json')
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.products, f, indent=2, ensure_ascii=False)
        self.logger.info(f"Saved results to {json_file}")

    def generate_report(self) -> Dict:
        """Generate scraping report."""
        prices = [p['price'] for p in self.products if p.get('price')]
        return {
            'total_products': len(self.products),
            # 'category' is only counted if it was added to each product record
            'categories_scraped': len(set(p['category'] for p in self.products if 'category' in p)),
            'average_price': sum(prices) / len(prices) if prices else None,
            'scraped_at': datetime.now().isoformat()
        }

# Example usage
if __name__ == "__main__":
    # Initialize scraper
    scraper = ProductScraper(
        base_url='https://example.com',
        output_dir='product_data'
    )

    # Define categories to scrape
    categories = [
        'https://example.com/category1',
        'https://example.com/category2'
    ]

    try:
        # Start scraping
        scraper.scrape_products(categories)

        # Save results
        scraper.save_results()

        # Generate and print report
        report = scraper.generate_report()
        print("\nScraping Report")
        print("===============")
        for key, value in report.items():
            print(f"{key}: {value}")

    except Exception as e:
        scraper.logger.error(f"Scraping failed: {e}")

Best Practices

  1. Respect robots.txt
# @filename: utils.py
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def can_fetch(url: str) -> bool:
    """Check the site's robots.txt before scraping a URL."""
    rp = RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    return rp.can_fetch('*', url)
  2. Rate Limiting (a combined usage sketch follows this list)
# @filename: utils.py
import time
from functools import wraps

def rate_limit(delay: float):
    """Decorator that enforces a minimum delay between calls."""
    def decorator(func):
        last_called = [0.0]
        @wraps(func)
        def wrapper(*args, **kwargs):
            elapsed = time.time() - last_called[0]
            if elapsed < delay:
                time.sleep(delay - elapsed)
            result = func(*args, **kwargs)
            last_called[0] = time.time()
            return result
        return wrapper
    return decorator
  3. Error Handling
# @filename: utils.py
def safe_request(url: str) -> Optional[str]:
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.Timeout:
        logger.error("Request timed out")
    except requests.HTTPError as e:
        logger.error(f"HTTP error: {e}")
    except requests.RequestException as e:
        logger.error(f"Request failed: {e}")
    return None
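
Putting these practices together, here is a minimal usage sketch; polite_fetch is a hypothetical helper and the URLs are placeholders, while fetch_page, can_fetch, rate_limit, and logger come from the utils.py snippets above:

# @filename: utils.py
@rate_limit(delay=2.0)
def polite_fetch(url: str) -> Optional[str]:
    """Fetch a page only if robots.txt allows it, at most once every two seconds."""
    if not can_fetch(url):
        logger.warning(f"robots.txt disallows fetching {url}")
        return None
    return fetch_page(url)

# Example: fetch a couple of placeholder URLs politely
for page_url in ['https://example.com/page1', 'https://example.com/page2']:
    html = polite_fetch(page_url)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        print(page_url, soup.title.string if soup.title else 'no title')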

Common Patterns

  1. Pagination Handling
# @filename: utils.py
def scrape_paginated_content(base_url: str, max_pages: int = 10):
    """Walk numbered pages until an empty page or a failed request."""
    results = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        content = fetch_page(url)
        if not content:
            break
        # parse_page is assumed to extract a list of items from the HTML
        # (a sketch follows this list)
        page_results = parse_page(content)
        if not page_results:
            break
        results.extend(page_results)
    return results
  2. Data Cleaning
# @filename: utils.py
def clean_text(text: str) -> str:
    """Clean scraped text."""
    import re
    # Remove extra whitespace
    text = ' '.join(text.split())
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()
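
The pagination helper above calls a parse_page function that is not defined in this guide; here is one possible sketch, with placeholder selectors ('.result-item', '.result-title') that would need to match the target site's markup:

# @filename: utils.py
def parse_page(html: str) -> List[Dict]:
    """Parse one results page into a list of records (selectors are placeholders)."""
    soup = BeautifulSoup(html, 'html.parser')
    items = []
    for card in soup.select('.result-item'):
        title_el = card.select_one('.result-title')
        link_el = card.select_one('a')
        items.append({
            'title': clean_text(title_el.text) if title_el else '',
            'url': link_el.get('href') if link_el else None,
        })
    return items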

Conclusion

BeautifulSoup provides powerful tools for web scraping:

  • Easy-to-use API
  • Robust parsing capabilities
  • Good documentation
  • Active community support

Keep exploring BeautifulSoup’s features to build better web scrapers.
