#!/usr/bin/env python3
"""
Script to process videos.csv through the Trax download and transcribe pipeline.

Processes URLs from videos.csv (one per line) and runs batch processing.
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
def _run_step(label: str, command: list) -> bool:
    """Run one pipeline command and report a failure under *label*.

    The command's output is captured rather than streamed. When the exit
    status is non-zero, the captured stderr is printed; if stderr is blank
    the captured stdout is printed instead so the message is never empty.

    Args:
        label: Step name inserted into the error message.
        command: Argument vector passed to ``subprocess.run`` (no shell).

    Returns:
        True when the command exits with status 0, False otherwise.
    """
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode == 0:
        return True

    # Some tools write their failure report to stdout only.
    details = result.stderr if result.stderr.strip() else result.stdout
    print(f"❌ Error in {label} step: {details}")
    return False


def run_batch_processing(csv_file: str, downloads_dir: str = "data/media/downloads") -> bool:
    """Run the two-step batch pipeline through the project CLI.

    Step 1 runs ``batch-urls <csv_file> --download`` and step 2 runs
    ``batch <downloads_dir> --v1``, both via
    ``uv run python -m src.cli.main``. Step 2 is skipped when step 1 fails.

    Args:
        csv_file: Path of the URL list handed to the ``batch-urls`` command.
        downloads_dir: Directory handed to the ``batch`` command. Defaults to
            the location the original script hard-coded.

    Returns:
        True when both commands exit with status 0. False when either one
        fails or when a command cannot be started at all (for example the
        ``uv`` executable is missing).
    """
    cli = ["uv", "run", "python", "-m", "src.cli.main"]

    # This function reports every failure through its return value, so any
    # exception raised while launching a step is turned into False here.
    try:
        print(f"\n🚀 Starting batch processing for: {csv_file}")

        # Step 1: Download and extract metadata
        print("\n📥 Step 1: Downloading videos and extracting metadata...")
        if not _run_step("download", [*cli, "batch-urls", csv_file, "--download"]):
            return False
        print("✅ Download and metadata extraction completed")

        # Step 2: Transcribe everything found in the downloads directory
        print("\n🎤 Step 2: Transcribing videos...")
        if not _run_step("transcription", [*cli, "batch", downloads_dir, "--v1"]):
            return False
        print("✅ Transcription completed")

        return True

    except Exception as e:
        print(f"❌ Error in batch processing: {e}")
        return False
|
|
|
|
def count_urls(csv_file: str) -> int:
    """Count the non-blank lines in *csv_file* (one URL per line).

    A line counts when it still has content after stripping whitespace.
    No URL validation is performed.

    Args:
        csv_file: Path of the text file to scan.

    Returns:
        The number of non-blank lines, or 0 when the file cannot be opened
        or read (the error is printed, not raised).
    """
    try:
        # utf-8-sig reads plain UTF-8 and also drops a leading byte-order
        # mark, which str.strip() does not treat as whitespace and which
        # would otherwise make an empty first line look like a URL.
        # errors="replace" keeps stray non-UTF-8 bytes from aborting a pure
        # line count.
        with open(csv_file, "r", encoding="utf-8-sig", errors="replace") as f:
            return sum(1 for line in f if line.strip())
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError: malformed path
        # (e.g. an embedded null byte).
        print(f"❌ Error counting URLs: {e}")
        return 0
|
|
|
|
def main(csv_file: str = "videos.csv") -> int:
    """Validate the URL list, then run the batch pipeline on it.

    Args:
        csv_file: Path of the URL list to process. Defaults to
            ``videos.csv`` in the current working directory.

    Returns:
        Process exit status: 0 when the pipeline succeeds, 1 when the file
        is missing, contains no URLs, or the pipeline fails.
    """
    print("🎬 Trax Video Processing Pipeline")
    print("=" * 40)

    # is_file() rather than exists(): a directory with this name cannot be
    # read as a URL list and should be rejected up front.
    if not Path(csv_file).is_file():
        print(f"❌ {csv_file} not found!")
        return 1

    # Count URLs
    url_count = count_urls(csv_file)
    if url_count == 0:
        print("❌ No URLs found in the file!")
        return 1

    print(f"📋 Found {url_count} URLs in {csv_file}")

    # Run batch processing
    if not run_batch_processing(csv_file):
        print("\n❌ Pipeline failed!")
        return 1

    print("\n🎉 Pipeline completed successfully!")
    # url_count is the number of lines read from the list, not a count
    # reported back by the pipeline.
    print(f"📊 Processed {url_count} videos")
    print("📁 Check the data/ directory for results")
    print("📁 Transcripts available in data/exports/")
    return 0
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|