All Projects
advanced
Web Scraper
A web scraping tool that extracts data from websites, processes it, and saves results in various formats.
Features
- URL-based data extraction
- HTML parsing
- Data cleaning
- Export to CSV/JSON
- Rate limiting
Technologies Used
requests, BeautifulSoup, CSV module, JSON, Regular Expressions
Explanation
Uses the requests library to fetch web pages and BeautifulSoup for parsing HTML. Extracts specific data elements and saves them in structured formats.
# Web Scraper - Requires: pip install requests beautifulsoup4
# This is a template - modify targets and selectors for your needs
import csv
import json
import time
try:
import requests
from bs4 import BeautifulSoup
HAS_DEPS = True
except ImportError:
HAS_DEPS = False
class WebScraper:
    """Fetch pages, collect records in ``self.data``, and export them.

    Records are plain dicts appended to ``self.data``; ``save_csv`` and
    ``save_json`` write whatever has been collected so far.
    """

    def __init__(self, delay=1):
        """Create a scraper.

        Args:
            delay: Seconds to sleep before each request (rate limiting).
                Zero, negative, or ``None`` disables the pause.
        """
        self.data = []
        self.delay = delay
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Educational Scraper)"
        }

    def fetch_page(self, url):
        """Fetch a web page with rate limiting.

        Args:
            url: Address of the page to download.

        Returns:
            The parsed ``BeautifulSoup`` document, or ``None`` if anything
            went wrong (the error is printed, not raised).
        """
        try:
            # Be respectful: pause before every request.
            if self.delay and self.delay > 0:
                time.sleep(self.delay)
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            # Deliberate best-effort boundary: callers only check for None.
            print(f"Error fetching {url}: {e}")
            return None

    def scrape_quotes(self, url="http://quotes.toscrape.com"):
        """Example: Scrape quotes from quotes.toscrape.com

        Appends one ``{"text", "author", "tags"}`` dict per quote to
        ``self.data``. Quote blocks missing their text or author element
        are skipped instead of crashing the whole run.
        """
        soup = self.fetch_page(url)
        if soup is None:
            return

        scraped = 0
        for quote in soup.find_all("div", class_="quote"):
            text_el = quote.find("span", class_="text")
            author_el = quote.find("small", class_="author")
            # find() returns None when the element is absent.
            if text_el is None or author_el is None:
                continue
            self.data.append({
                "text": text_el.get_text(),
                "author": author_el.get_text(),
                "tags": [tag.get_text() for tag in quote.find_all("a", class_="tag")],
            })
            scraped += 1
        print(f"Scraped {scraped} quotes!")

    def save_csv(self, filename="output.csv"):
        """Write ``self.data`` to *filename* as CSV.

        The header is the ordered union of the keys of all rows, so rows
        with differing keys are supported; missing values become empty
        cells. Every value is converted with ``str``. Nothing is written
        when there is no data.
        """
        if not self.data:
            print("No data to save!")
            return

        # A dict keeps insertion order, giving an ordered, de-duplicated
        # union of the column names across all rows.
        columns = {}
        for row in self.data:
            columns.update(dict.fromkeys(row))

        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(columns))
            writer.writeheader()
            for row in self.data:
                writer.writerow({k: str(v) for k, v in row.items()})
        print(f"Saved to {filename}")

    def save_json(self, filename="output.json"):
        """Write ``self.data`` to *filename* as indented UTF-8 JSON.

        Like ``save_csv``, nothing is written when there is no data, so a
        failed scrape does not overwrite an earlier result with ``[]``.
        """
        if not self.data:
            print("No data to save!")
            return
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(self.data, f, indent=2, ensure_ascii=False)
        print(f"Saved to {filename}")
def main():
    """Run the demo: scrape the sample site, export it, and preview results.

    Prints an installation hint and does nothing else when the third-party
    dependencies could not be imported.
    """
    if HAS_DEPS:
        scraper = WebScraper()
        scraper.scrape_quotes()

        # Export in both supported formats.
        scraper.save_json()
        scraper.save_csv()

        # Preview at most the first three records.
        print("\nSample data:")
        preview = scraper.data[:3]
        for entry in preview:
            snippet = entry["text"][:50]
            author = entry["author"]
            print(f' "{snippet}..." - {author}')
    else:
        print("Install dependencies: pip install requests beautifulsoup4")
# Run the demo only when executed as a script; importing this module must
# not trigger network requests or write output files.
if __name__ == "__main__":
    main()