#!/usr/bin/env python3
"""
fetch_artworks.py
Download artwork images for every entry in artworks.json → ./images/artworks/…

How it works
------------
1. Wikipedia API  : get the Wikidata Q-id for the artwork page
2. Wikidata API   : fetch property P18 (image filename) for that Q-id
3. Commons file   : resolve File:… to a real image URL via Special:FilePath
4. Save locally   : mkdir -p images/artworks && write binary data

If anything fails we log and continue.
"""

import json
import os
import pathlib
import sys
import time
from urllib.parse import quote

import requests
from tqdm import tqdm

WIKI_API    = "https://en.wikipedia.org/w/api.php"
WIKIDATA_API = "https://www.wikidata.org/wiki/Special:EntityData/{}.json"
COMMONS_FILE = "https://commons.wikimedia.org/wiki/Special:FilePath/{}"

HEADERS = {"User-Agent": "TimelineBot/0.1 (https://github.com/timeline)"}

# Manual overrides for artworks that need specific Wikipedia page titles or direct Wikidata IDs
# Can also use direct URLs for problematic images
ARTWORK_OVERRIDES = {
    "Sistine Chapel Ceiling": "Sistine Chapel ceiling",
    "Basket of Fruit": "Basket of Fruit (Caravaggio)",
    "The Great Wave off Kanagawa": "The Great Wave off Kanagawa",
    "The Disquieting Muses": "The Disquieting Muses",
    "Nighthawks": "Nighthawks (Hopper)",
    "Liberty Leading the People": "Liberty Leading the People",
    "David with the Head of Goliath": "David with the Head of Goliath (Caravaggio)",
    "Self-Portrait with Thorn Necklace and Hummingbird": "Self-Portrait with Thorn Necklace and Hummingbird",
    "Ophelia": "Ophelia (painting)",
    "Irises": "Irises (painting)",
    "Girl with a Pearl Earring": "Girl with a Pearl Earring",
    "Wheat Field with Cypresses": "Wheat Field with Cypresses",
    "The Water Lily Pond": "Water Lilies (Monet)",
    "Vitruvian Man": "Vitruvian Man",
}

# Direct image URLs for artworks that can't be found via Wikidata or need specific versions
DIRECT_IMAGE_URLS = {
    "David with the Head of Goliath": "https://upload.wikimedia.org/wikipedia/commons/f/f6/David_with_the_Head_of_Goliath-Caravaggio_%281610%29.jpg",
    "Self-Portrait with Thorn Necklace and Hummingbird": "https://upload.wikimedia.org/wikipedia/en/1/1e/Frida_Kahlo_%28self_portrait%29.jpg",
    "The Disquieting Muses": "https://upload.wikimedia.org/wikipedia/en/d/df/The_Disquieting_Muses.jpg",
    "Irises": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3e/Irises-Vincent_van_Gogh.jpg/800px-Irises-Vincent_van_Gogh.jpg",
    "Lilac Irises": "https://www.artchive.com/wp-content/uploads/2024/04/lilac-irisesclaude-monet-1914-1917.jpg",
    "Sunflowers": "https://upload.wikimedia.org/wikipedia/commons/4/46/Vincent_Willem_van_Gogh_127.jpg",
    "Wheat Field with Cypresses": "https://upload.wikimedia.org/wikipedia/commons/c/ce/Wheat-Field-with-Cypresses-%281889%29-Vincent-van-Gogh-Met.jpg",
    "Monet's Garden in Giverny": "https://upload.wikimedia.org/wikipedia/commons/b/b1/Monet_-_Monets_Garten_in_Giverny.jpg",
    "The School of Athens": "https://upload.wikimedia.org/wikipedia/commons/4/49/%22The_School_of_Athens%22_by_Raffaello_Sanzio_da_Urbino.jpg",
}

# --------------------------------------------------------------------------- #
# Helper functions
# --------------------------------------------------------------------------- #
def wikipedia_to_qid(title: str) -> str | None:
    """Return the Wikidata Q-identifier for a Wikipedia page title."""
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "pageprops",
        "redirects": 1,
    }
    try:
        r = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=15)
        r.raise_for_status()
        pages = r.json()["query"]["pages"].values()
        for page in pages:
            return page.get("pageprops", {}).get("wikibase_item")  # e.g. "Q937"
    except Exception as e:
        print(f"Error fetching Wikipedia data: {e}")
    return None


def qid_to_image_filename(qid: str) -> str | None:
    """Return the Commons file name (e.g. 'Mona Lisa.jpg')."""
    try:
        r = requests.get(WIKIDATA_API.format(qid), headers=HEADERS, timeout=15)
        r.raise_for_status()
        entity = r.json()["entities"][qid]
        claims = entity["claims"]
        if "P18" not in claims:
            return None
        # Use the first image in P18
        return claims["P18"][0]["mainsnak"]["datavalue"]["value"]
    except Exception as e:
        print(f"Error fetching Wikidata: {e}")
    return None


def download_commons_file(filename: str, dest_path: pathlib.Path) -> bool:
    """
    Resolve image via Special:FilePath and stream to dest_path.
    Returns True if saved, False if already on disk.
    """
    if dest_path.exists():
        return False

    # 1. MediaWiki expects underscores, not spaces
    fname = filename.replace(" ", "_")

    # 2. URL-encode *except* underscores, parentheses, apostrophes, dots, dashes
    #    (keeps filenames readable and avoids double-encoding)
    safe = "_().'-"
    url = f"https://commons.wikimedia.org/wiki/Special:FilePath/{quote(fname, safe=safe)}"

    try:
        with requests.get(url, headers=HEADERS, stream=True, timeout=30) as r:
            r.raise_for_status()          # will now 302 → 200 instead of 404
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(dest_path, "wb") as fp:
                for chunk in r.iter_content(chunk_size=8192):
                    fp.write(chunk)
        return True
    except Exception as e:
        print(f"Error downloading {filename}: {e}")
        return False


# --------------------------------------------------------------------------- #
# Main script
# --------------------------------------------------------------------------- #
def download_direct_url(url: str, dest_path: pathlib.Path) -> bool:
    """Download image from a direct URL."""
    if dest_path.exists():
        return False
    
    try:
        with requests.get(url, headers=HEADERS, stream=True, timeout=30) as r:
            r.raise_for_status()
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(dest_path, "wb") as fp:
                for chunk in r.iter_content(chunk_size=8192):
                    fp.write(chunk)
        return True
    except Exception as e:
        print(f"Error downloading from URL: {e}")
        return False


def main(json_path="artworks.json"):
    artworks = json.loads(pathlib.Path(json_path).read_text())
    saved, skipped, err = 0, 0, 0

    for artwork in tqdm(artworks, unit="artwork"):
        title = artwork["title"]
        # Output to artworks-original directory instead of artworks
        image_path = artwork["image"].replace("/images/artworks/", "/images/artworks-original/")
        out_path = pathlib.Path("." + image_path)  # strip leading slash

        # Check if we have a direct URL first
        if title in DIRECT_IMAGE_URLS:
            try:
                if download_direct_url(DIRECT_IMAGE_URLS[title], out_path):
                    saved += 1
                    tqdm.write(f"✅ Downloaded (direct): {title}")
                else:
                    skipped += 1
                time.sleep(0.3)
                continue
            except Exception as e:
                tqdm.write(f"❌ {title} (direct): {e}")
                err += 1
                continue

        # Use override if available, otherwise use the title as-is
        wiki_title = ARTWORK_OVERRIDES.get(title, title)

        try:
            qid = wikipedia_to_qid(wiki_title)
            if not qid:
                tqdm.write(f"⚠️  {title}: no Wikidata Q-id");  err += 1;  continue

            filename = qid_to_image_filename(qid)
            if not filename:
                tqdm.write(f"⚠️  {title}: no P18 image");       err += 1;  continue

            if download_commons_file(filename, out_path):
                saved += 1
                tqdm.write(f"✅ Downloaded: {title}")
            else:
                skipped += 1  # already existed
            time.sleep(0.3)  # be polite to the APIs
        except Exception as e:
            tqdm.write(f"❌ {title}: {e}")
            err += 1

    print(f"\nDone: {saved} downloaded, {skipped} skipped, {err} errors.")


if __name__ == "__main__":
    # Change to src/lib/data directory where artworks.json should be
    script_dir = pathlib.Path(__file__).parent
    data_dir = script_dir.parent / "src" / "lib" / "data"
    
    if not data_dir.exists():
        sys.exit(f"Data directory not found: {data_dir}")
    
    os.chdir(data_dir)
    
    if not pathlib.Path("artworks.json").exists():
        sys.exit("artworks.json not found in src/lib/data directory.")
    
    main("artworks.json")