advanced

Web Scraper

A web scraping tool that extracts data from websites, processes it, and saves results in various formats.

Features

URL-based data extraction
HTML parsing
Data cleaning
Export to CSV/JSON
Rate limiting

Technologies Used

requests, BeautifulSoup, CSV module, JSON, Regular Expressions

Explanation

Uses the requests library to fetch web pages and BeautifulSoup for parsing HTML. Extracts specific data elements and saves them in structured formats.

Source Code

# Web Scraper - Requires: pip install requests beautifulsoup4
# This is a template - modify targets and selectors for your needs

import csv
import json
import time

try:
    import requests
    from bs4 import BeautifulSoup
    HAS_DEPS = True
except ImportError:
    HAS_DEPS = False

class WebScraper:
    """Fetch pages politely, collect scraped records, and export them.

    Scraped records accumulate in ``self.data`` as dictionaries and can be
    written out with ``save_csv`` or ``save_json``.

    Args:
        delay: Seconds to pause before every request (rate limiting).
        timeout: Seconds to wait for a server response before giving up.
    """

    def __init__(self, delay=1.0, timeout=10):
        self.data = []
        self.delay = delay
        self.timeout = timeout
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Educational Scraper)"
        }

    def fetch_page(self, url: str) -> "BeautifulSoup | None":
        """Fetch *url* and return the parsed page, or ``None`` on failure.

        Sleeps for ``self.delay`` seconds first so that repeated calls are
        rate limited. Request errors (connection problems, timeouts, HTTP
        error statuses, invalid URLs) are reported on stdout instead of
        being raised.
        """
        if self.delay > 0:
            time.sleep(self.delay)  # Be respectful
        # Keep the try body to the calls that can actually fail on the network.
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.timeout
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None
        return BeautifulSoup(response.text, "html.parser")

    def scrape_quotes(self, url: str = "http://quotes.toscrape.com") -> None:
        """Example: Scrape quotes from quotes.toscrape.com

        Appends one ``{"text", "author", "tags"}`` record to ``self.data``
        for every quote block on the page. Blocks that lack a text or an
        author element are skipped rather than aborting the whole scrape.
        Does nothing when the page could not be fetched.
        """
        soup = self.fetch_page(url)
        if soup is None:
            return

        added = 0
        for quote in soup.find_all("div", class_="quote"):
            text_el = quote.find("span", class_="text")
            author_el = quote.find("small", class_="author")
            # find() yields None when nothing matches; skip incomplete blocks.
            if text_el is None or author_el is None:
                continue
            self.data.append({
                "text": text_el.get_text(),
                "author": author_el.get_text(),
                "tags": [
                    tag.get_text()
                    for tag in quote.find_all("a", class_="tag")
                ],
            })
            added += 1
        print(f"Scraped {added} quotes!")

    def save_csv(self, filename: str = "output.csv") -> None:
        """Write the collected records to *filename* as UTF-8 CSV.

        Column names come from the keys of the first record. Every value is
        converted with ``str()``, so list values (such as tags) appear in
        their Python literal form. Nothing is written when there is no data.
        """
        if not self.data:
            print("No data to save!")
            return
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(self.data[0]))
            writer.writeheader()
            writer.writerows(
                {k: str(v) for k, v in row.items()} for row in self.data
            )
        print(f"Saved to {filename}")

    def save_json(self, filename: str = "output.json") -> None:
        """Write the collected records to *filename* as indented UTF-8 JSON.

        Nothing is written when there is no data, so a failed scrape cannot
        overwrite an earlier result file with an empty list.
        """
        if not self.data:
            print("No data to save!")
            return
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.data, f, indent=2, ensure_ascii=False)
        print(f"Saved to {filename}")

def main():
    """Run the demo: scrape the sample site, export it, and preview results.

    When the third-party dependencies could not be imported, only an
    installation hint is printed.
    """
    if HAS_DEPS:
        bot = WebScraper()
        bot.scrape_quotes()
        # Export order: JSON first, then CSV.
        bot.save_json()
        bot.save_csv()

        print("\nSample data:")
        for entry in bot.data[:3]:
            snippet = entry["text"][:50]
            who = entry["author"]
            print(f'  "{snippet}..." - {who}')
    else:
        print("Install dependencies: pip install requests beautifulsoup4")

# Run the demo only when executed as a script, so that importing this module
# does not trigger network requests or file writes.
if __name__ == "__main__":
    main()