trax/process_videos_csv.py

95 lines
2.8 KiB
Python

#!/usr/bin/env python3
"""
Script to process videos.csv through the Trax download and transcribe pipeline.
Processes URLs from videos.csv (one per line) and runs batch processing.
"""
import subprocess
import sys
from pathlib import Path
def _run_cli_step(command: list, label: str) -> bool:
    """Run one pipeline command; report and return False on a non-zero exit."""
    result = subprocess.run(
        command,
        capture_output=True,
        text=True,
        # Child output is only used for diagnostics, so undecodable bytes
        # must not turn a finished step into a UnicodeDecodeError.
        errors="replace",
    )
    if result.returncode == 0:
        return True
    # Some tools report failures on stdout; fall back to it when stderr
    # carries nothing useful so the message is never blank.
    details = result.stderr if result.stderr.strip() else result.stdout
    print(f"❌ Error in {label} step: {details}")
    return False


def run_batch_processing(
    csv_file: str,
    *,
    cli_command: tuple = ("uv", "run", "python", "-m", "src.cli.main"),
    downloads_dir: str = "data/media/downloads",
) -> bool:
    """Run the two-step pipeline: download, then transcribe.

    Step 1 launches the CLI's ``batch-urls`` command on *csv_file* with
    ``--download``. Step 2 launches the ``batch`` command on *downloads_dir*
    with ``--v1``. Each step runs as a child process with its output
    captured, and step 2 only runs when step 1 exits with status 0.

    Args:
        csv_file: Path of the URL list passed to ``batch-urls``.
        cli_command: Argument prefix used to launch the CLI; the step's own
            arguments are appended to it. Defaults to running
            ``src.cli.main`` through ``uv``.
        downloads_dir: Directory passed to the ``batch`` command.

    Returns:
        True when both steps exit with status 0. False when a step exits
        non-zero or a child process cannot be started (for example the
        launcher executable is missing); the failure is printed, not raised.
    """
    steps = (
        (
            "\n📥 Step 1: Downloading videos and extracting metadata...",
            ["batch-urls", csv_file, "--download"],
            "download",
            "✅ Download and metadata extraction completed",
        ),
        (
            "\n🎤 Step 2: Transcribing videos...",
            ["batch", downloads_dir, "--v1"],
            "transcription",
            "✅ Transcription completed",
        ),
    )
    try:
        print(f"\n🚀 Starting batch processing for: {csv_file}")
        for banner, step_args, label, done_message in steps:
            print(banner)
            if not _run_cli_step([*cli_command, *step_args], label):
                return False
            print(done_message)
        return True
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError covers a missing or non-executable launcher; ValueError
        # covers arguments the OS rejects (e.g. an embedded NUL byte).
        print(f"❌ Error in batch processing: {e}")
        return False
def count_urls(csv_file: str) -> int:
    """Count the non-blank lines in *csv_file* (one URL per line).

    The file is decoded as UTF-8 with a leading byte-order mark stripped and
    undecodable bytes replaced, so the count does not depend on the platform
    locale and a stray byte cannot make the whole file look empty.

    Args:
        csv_file: Path of the file to scan.

    Returns:
        The number of lines that contain something other than whitespace,
        or 0 if the file cannot be opened or read (the error is printed).
    """
    try:
        with open(csv_file, encoding="utf-8-sig", errors="replace") as f:
            # Count lazily; the lines themselves are never needed.
            return sum(1 for line in f if line.strip())
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable path. ValueError: a path the OS
        # rejects outright (e.g. an embedded NUL byte).
        print(f"❌ Error counting URLs: {e}")
        return 0
def main(csv_file: str = "videos.csv") -> int:
    """Validate the URL list, then run the download/transcribe pipeline.

    Args:
        csv_file: Path of the URL list (one URL per line). Defaults to
            ``videos.csv`` in the current working directory.

    Returns:
        0 when the pipeline completes; 1 when the file is missing, contains
        no URLs, or the pipeline reports a failure.
    """
    print("🎬 Trax Video Processing Pipeline")
    print("=" * 40)

    # is_file() rather than exists(): a directory at this path cannot be
    # read as a URL list, so reject it here with the clearer message.
    if not Path(csv_file).is_file():
        print(f"{csv_file} not found!")
        return 1

    url_count = count_urls(csv_file)
    if url_count == 0:
        print("❌ No URLs found in the file!")
        return 1
    print(f"📋 Found {url_count} URLs in {csv_file}")

    if not run_batch_processing(csv_file):
        print("\n❌ Pipeline failed!")
        return 1

    print("\n🎉 Pipeline completed successfully!")
    print(f"📊 Processed {url_count} videos")
    print("📁 Check the data/ directory for results")
    print("📁 Transcripts available in data/exports/")
    return 0
if __name__ == "__main__":
    # Hand main()'s status code to the shell as the process exit code.
    exit_code = main()
    sys.exit(exit_code)