youtube-automation/one_time_scan.py

254 lines
9.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""
One-time scan to populate ALL existing YouTube media items with thumbnails
"""
import sys
import logging
from datetime import datetime
from typing import Dict, List
# Add src directory to path
sys.path.append('src')
from config import BATCH_SIZE, LOG_LEVEL
from directus_client import DirectusClient
from youtube_processor import YouTubeProcessor
class OneTimeThumbnailScanner:
    """One-time scanner that fills in missing thumbnails on YouTube media items.

    The scanner fetches every YouTube media item through the Directus items
    endpoint, and for each item without a ``youtube_thumb`` value it downloads
    a thumbnail, uploads it, and links it to the item. Counters for the run
    are kept in ``self.stats``.
    """

    def __init__(self):
        """Create the API clients, reset the counters, and configure logging."""
        self.directus_client = DirectusClient()
        self.youtube_processor = YouTubeProcessor()
        # Statistics
        self.stats = {
            'items_found': 0,
            'items_processed': 0,
            'items_succeeded': 0,
            'items_failed': 0,
            'items_skipped': 0,
            'start_time': datetime.now()
        }
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging (stdout + log file) and create ``self.logger``.

        The level name comes from ``LOG_LEVEL``. An unrecognised name falls
        back to ``INFO`` (with a warning) instead of raising AttributeError.
        """
        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        level = getattr(logging, str(LOG_LEVEL).upper(), None)
        level_is_valid = isinstance(level, int)
        if not level_is_valid:
            level = logging.INFO
        logging.basicConfig(
            level=level,
            format=log_format,
            handlers=[
                logging.StreamHandler(sys.stdout),
                logging.FileHandler('/tmp/youtube_one_time_scan.log')
            ]
        )
        self.logger = logging.getLogger(__name__)
        if not level_is_valid:
            self.logger.warning(f"Unknown LOG_LEVEL {LOG_LEVEL!r}; falling back to INFO")
        self.logger.info("🎬 Starting one-time YouTube thumbnail scan...")

    def get_all_youtube_items(self) -> List[Dict]:
        """Get ALL YouTube items (with and without thumbnails) for complete scan.

        Pages through ``media_items`` whose ``type`` is ``youtube_video`` or
        ``youtube`` and whose ``url`` is not null, until an empty page is
        returned.

        Returns:
            The items fetched. If a request fails (non-200 response or an
            exception) the error is logged, paging stops, and the items
            fetched up to that point are returned; the list is empty when
            nothing could be fetched. ``self.stats['items_found']`` is set to
            the length of the returned list.
        """
        all_items: List[Dict] = []
        try:
            # Imported lazily, inside the try, so that a missing dependency is
            # logged and reported as "no items" rather than raised.
            import requests
            import json
            from config import DIRECTUS_ITEMS_URL

            # Query for ALL YouTube items regardless of thumbnail status
            filter_json = json.dumps({
                "_and": [
                    {
                        "_or": [
                            {"type": {"_eq": "youtube_video"}},
                            {"type": {"_eq": "youtube"}}
                        ]
                    },
                    {"url": {"_nnull": True}}
                ]
            })
            offset = 0
            limit = 100  # Larger batch for scanning
            while True:
                filter_params = {
                    "filter": filter_json,
                    "limit": limit,
                    "offset": offset,
                    "fields": "id,url,type,title,youtube_thumb"
                }
                response = requests.get(
                    f"{DIRECTUS_ITEMS_URL}/media_items",
                    headers=self.directus_client.headers,
                    params=filter_params,
                    timeout=30
                )
                if response.status_code != 200:
                    self.logger.error(f"Failed to get media items: {response.status_code} - {response.text}")
                    break
                items = response.json().get('data') or []
                if not items:
                    break
                all_items.extend(items)
                # Advance by what was actually received: if the server returns
                # fewer rows than requested (e.g. it enforces a smaller maximum
                # page size), stepping by `limit` would skip rows.
                offset += len(items)
                self.logger.info(f"Fetched {len(items)} items (total: {len(all_items)})")
        except Exception as e:
            # Same policy as the non-200 branch: stop paging, keep what we have.
            self.logger.error(f"Error getting all YouTube items: {e}")
        self.stats['items_found'] = len(all_items)
        self.logger.info(f"Found {len(all_items)} total YouTube items")
        return all_items

    def process_media_item(self, item: Dict) -> bool:
        """Process a single media item.

        Steps: extract the video ID from the item's URL, download a thumbnail,
        upload it, then attach the uploaded file to the media item.

        Args:
            item: Media item dict; the keys read are ``id``, ``url``,
                ``title`` and ``youtube_thumb``.

        Returns:
            True if the item already had a thumbnail (counted in
            ``items_skipped``) or the thumbnail was attached successfully;
            False if any step failed or raised.
        """
        item_id = item.get('id')
        item_url = item.get('url')
        # `or` instead of a .get() default: the key can be present with a null
        # value, in which case the default would never be used.
        item_title = item.get('title') or f"Media Item {item_id}"
        existing_thumb = item.get('youtube_thumb')
        # Skip if already has thumbnail
        if existing_thumb:
            self.logger.info(f"⏭️ Item {item_id} already has thumbnail: {existing_thumb}")
            self.stats['items_skipped'] += 1
            return True
        self.logger.info(f"🔄 Processing item {item_id}: {item_title}")
        try:
            # Extract video ID
            video_id = self.youtube_processor.extract_video_id(item_url)
            if not video_id:
                self.logger.error(f"Could not extract video ID from URL: {item_url}")
                return False
            # Download thumbnail
            thumbnail_data, filename = self.youtube_processor.download_best_thumbnail(video_id)
            if not thumbnail_data or not filename:
                self.logger.error(f"Could not download thumbnail for video: {video_id}")
                return False
            # Upload to Directus
            file_id = self.directus_client.upload_file(
                thumbnail_data,
                filename,
                title=f"YouTube Thumbnail - {video_id}"
            )
            if not file_id:
                self.logger.error(f"Could not upload thumbnail for video: {video_id}")
                return False
            # Update media item
            success = self.directus_client.update_media_item_thumbnail(item_id, file_id)
            if success:
                self.logger.info(f"✅ Successfully processed item {item_id} -> thumbnail {file_id}")
                return True
            self.logger.error(f"❌ Failed to update media item {item_id}")
            return False
        except Exception as e:
            # One bad item must not abort the whole scan; report it as failed.
            self.logger.error(f"❌ Error processing item {item_id}: {e}")
            return False

    def print_final_statistics(self):
        """Print final scan statistics (duration, counters, success rate, coverage)."""
        uptime = datetime.now() - self.stats['start_time']
        print("\n📊 One-Time Scan Complete!")
        print("=" * 40)
        print(f" Duration: {uptime}")
        print(f" Items Found: {self.stats['items_found']}")
        print(f" Items Processed: {self.stats['items_processed']}")
        print(f" Already Had Thumbnails: {self.stats['items_skipped']}")
        print(f" Successfully Added: {self.stats['items_succeeded']}")
        print(f" Failed: {self.stats['items_failed']}")
        if self.stats['items_processed'] > 0:
            success_rate = (self.stats['items_succeeded'] / self.stats['items_processed']) * 100
            print(f" Success Rate: {success_rate:.1f}%")
        # Coverage counts items that had a thumbnail already plus those added now.
        total_with_thumbs = self.stats['items_skipped'] + self.stats['items_succeeded']
        coverage = (total_with_thumbs / self.stats['items_found']) * 100 if self.stats['items_found'] > 0 else 0
        print(f" Total Coverage: {coverage:.1f}% ({total_with_thumbs}/{self.stats['items_found']})")
        print("")

    def run(self):
        """Main scanning process.

        Fetches all YouTube items, processes those without a thumbnail, and
        prints the final statistics. Returns early, without statistics, when
        no items are found.

        Raises:
            Exception: any unexpected error is logged, statistics are
                printed, and the error is re-raised. KeyboardInterrupt is
                handled here (statistics are printed, nothing is re-raised).
        """
        print("🎬 YouTube Thumbnail One-Time Scan")
        print("==================================")
        print("This will scan ALL YouTube media items and populate missing thumbnails")
        print("")
        try:
            # Get all YouTube items
            self.logger.info("🔍 Scanning for all YouTube media items...")
            items = self.get_all_youtube_items()
            if not items:
                self.logger.info("No YouTube items found")
                return
            # Process each item
            total = len(items)
            self.logger.info(f"📋 Processing {total} YouTube items...")
            for i, item in enumerate(items, 1):
                title = item.get('title') or 'Untitled'
                print(f"\n[{i}/{total}] Processing: {title}")
                if item.get('youtube_thumb'):
                    # Already has a thumbnail: count it here so it is not
                    # also counted as processed/succeeded.
                    self.stats['items_skipped'] += 1
                else:
                    success = self.process_media_item(item)
                    # Update statistics
                    self.stats['items_processed'] += 1
                    if success:
                        self.stats['items_succeeded'] += 1
                    else:
                        self.stats['items_failed'] += 1
                # Progress update every 5 items. This sits outside the
                # skip/process branches so skipped items do not suppress it.
                if i % 5 == 0:
                    print(f"Progress: {i}/{total} items checked")
            # Final statistics
            self.print_final_statistics()
        except KeyboardInterrupt:
            self.logger.info("Scan interrupted by user")
            self.print_final_statistics()
        except Exception as e:
            self.logger.error(f"Scan error: {e}")
            self.print_final_statistics()
            raise
def main():
    """Entry point: build the scanner and run it.

    Any exception from constructing or running the scanner is printed and the
    process exits with status 1. Note that run() re-raises errors that occur
    mid-scan, so this handler is reached for those as well as for start-up
    failures.
    """
    try:
        scanner = OneTimeThumbnailScanner()
        scanner.run()
    except Exception as e:
        print(f"❌ Failed to start scan: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()