Automation & scripting

import requests
from bs4 import BeautifulSoup
import csv
import time
from typing import List, Dict, Optional

class WebScraper:
    def __init__(self, base_url: str, output_file: str = "scraped_data.csv"):
        self.base_url = base_url
        self.output_file = output_file
        self.session = requests.Session()…

11 0 Open

sitemap web-crawler html-parser

Build a Complete Website Sitemap Generator Without External Services

Crawl a website recursively using only Python's standard library to generate a structured sitemap of internal links.

import json
from urllib.parse import urlparse, urljoin
from collections import deque
import urllib.request
import urllib.error
import re
from html.parser import HTMLParser

class SitemapParser(HTMLParser):
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.links …

api web-crawling automation

Build a Python Tool to Find All API Endpoints on a Website

A Python script that crawls a website, searches for common API endpoint patterns in HTML and JavaScript, and returns all discovered public API URLs.

import re
import requests
from urllib.parse import urljoin, urlparse
from collections import deque

def find_api_endpoints(base_url, max_pages=10):
    visited = set()
    queue = deque([base_url])
    api_endpoints = set()
    
    api_patterns = [
        r'/api/[a-zA-Z0-9_/-]+',
        r'/v[0-9]+/[a-zA-Z0-9_/-]+',…

3 0 Open

Build a Website Accessibility Scanner Using Python

Scans a webpage for common accessibility issues like missing alt text, headings, labels, and landmarks using only Python.

accessibility a11y html

import requests
from urllib.parse import urljoin
from html.parser import HTMLParser
import re

class AccessibilityParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.images_without_alt = []
        self.missing_headings = True
        self.has_main_tag = False
        self.label_for_inp…

requests web scraping tech stack

Create a Python Script That Detects Website Technology Stack Automatically

This script sends an HTTP request to a URL and inspects headers and HTML content to identify technologies like servers, frameworks, and JavaScript libraries.

import requests
from re import search

def detect_tech_stack(url):
    tech_stack = []
    try:
        response = requests.get(url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
        headers = response.headers
        html = response.text.lower() if response.text else ''

        # Check server header
        …

rss web-scraping beautifulsoup

Discover RSS Feeds From Any Website in Python

Scrape a website's HTML to automatically find all linked RSS or Atom feed URLs using requests, BeautifulSoup, and regex.

import requests
import re
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def discover_rss_feeds(url):
    """Discover all RSS/Atom feeds linked from a given website."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; RSSDiscovery/1.0)'}
        response = requests.get(url…

web-scraping requests beautifulsoup

Download Images from a Web Page Automatically in Python

Scrape all images from a webpage, filter by extension, and save them to a local folder using requests and BeautifulSoup.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

def download_images(url, output_folder="downloaded_images"):
    """Download all images from a given URL."""
    os.makedirs(output_folder, exist_ok=True)
    
    response = requests.get(url)
    response.raise_for_status()
    …

5 0 Open

web-scraping links requests

Extract All Links from Any Website in Python

Scrape a webpage and extract all absolute HTTP/HTTPS links using requests and regex.

import requests
import re
from urllib.parse import urljoin

def extract_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        html = response.text
        # Find all href attributes in anchor tags
        pattern = r'href=["\'](.*?)["\']'
        raw_links = re.findall(p…

8 0 Open

meta tags open graph twitter cards

Extract Every Open Graph and Social Media Meta Tag from Web Pages in Python

A Python script that fetches a webpage and extracts all Open Graph, Twitter Card, Facebook, and Article meta tags using the standard library HTML parser.

from html.parser import HTMLParser
import re
from urllib.request import urlopen
from urllib.parse import urlparse

class MetaExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.meta_tags = []
    
    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attrs_…

redirects crawling requests

Find All Redirects on a Website in Python

Crawl a website from a starting URL, follow links within the same domain, and detect every HTTP redirect (301, 302, 303, 307, 308) using requests with redirects disabled.

import requests
from urllib.parse import urljoin, urlparse
from collections import deque

def find_redirects(start_url, max_pages=50):
    visited = set()
    redirects = {}
    queue = deque([start_url])
    
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
      …

web scraping crawling broken links

Find Broken Image References Across a Website in Python

Crawl internal pages of a website, collect all image source URLs, then check each with HEAD requests to report any that return HTTP 4xx or connection errors.

import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

def find_all_links(base_url, max_pages=50):
    visited, to_visit = set(), {base_url}
    while to_visit and len(visited) < max_pages:
        url = to_visit.pop()
 …

3 0 Open

How to Build a Cryptocurrency Price Tracker in Python

A Python script that runs continuously, fetching real-time cryptocurrency prices from the CoinGecko API and displaying them in a loop.

crypto api automation

import requests
import time

def get_crypto_prices(coin_ids=["bitcoin", "ethereum", "solana"]):
    url = "https://api.coingecko.com/api/v3/simple/price"
    params = {
        "ids": ",".join(coin_ids),
        "vs_currencies": "usd"
    }
    try:
        response = requests.get(url, params=params, timeout=10)
     …

4 0 Open

crawler graph visualization

How to Create a Link Graph Visualization for Any Website in Python

A Python script that crawls a website's internal links, builds a directed graph of parent-child URL relationships, and prints the graph to the console.

import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from urllib.parse import urljoin, urlparse
import sys

def get_links(url, max_links=20):
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        base_url = f"{urlparse(u…

How to Detect Unused Images in a Project with Python

A Python script that scans a website project folder, identifies all image files, and checks HTML/CSS/JS files to find which images are never referenced.

automation files regex

import os
import re
from pathlib import Path

def find_unused_images(project_path):
    image_exts = {'.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'}
    used_images = set()
    all_images = set()
    
    # Find all image files
    for root, _, files in os.walk(project_path):
        for file in files:
            …

web scraping monitoring requests

How to Monitor Website Content Changes in Python

This script fetches a webpage's content, computes its SHA-256 hash, and compares it with the last stored hash to detect and alert on changes.