trax/examples/export_example.py

217 lines
8.0 KiB
Python

#!/usr/bin/env python3
"""Real-world example of using the ExportService.

This example demonstrates how to export a transcript as JSON, TXT, SRT and
Markdown, and how to batch-export several transcripts, using realistic
transcript data from a tech podcast.
"""
import asyncio
import json
from pathlib import Path
from datetime import datetime, timezone
from src.services.export_service import ExportService, ExportFormat
# Directory that receives every file written by this example.
_EXPORT_DIR = Path("examples/exports")

# Number of characters shown when previewing an exported file.
_PREVIEW_CHARS = 300


def _build_podcast_transcript() -> dict:
    """Return the main sample transcript (podcast episode 42) used by the demo."""
    return {
        "id": "tech-podcast-episode-42",
        "title": "The Future of AI: From GPT-4 to AGI",
        "media_file_id": "podcast_episode_42.mp3",
        "pipeline_version": "v2",
        "content": {
            "text": "Welcome to Tech Insights Podcast Episode 42. Today we're diving deep into the world of artificial intelligence, from the latest developments in large language models to the path toward artificial general intelligence. I'm your host Sarah Chen, and joining me today is Dr. Michael Rodriguez, a leading researcher in AI safety and alignment.",
            "language": "en",
            "duration": 3240.5,  # 54 minutes
        },
        "segments": [
            {
                "start": 0.0,
                "end": 15.2,
                "text": "Welcome to Tech Insights Podcast Episode 42. Today we're diving deep into the world of artificial intelligence, from the latest developments in large language models to the path toward artificial general intelligence.",
                "confidence": 0.98,
                "speaker": "Sarah Chen",
            },
            {
                "start": 15.2,
                "end": 25.8,
                "text": "I'm your host Sarah Chen, and joining me today is Dr. Michael Rodriguez, a leading researcher in AI safety and alignment.",
                "confidence": 0.97,
                "speaker": "Sarah Chen",
            },
            {
                "start": 25.8,
                "end": 45.3,
                "text": "Thank you for having me, Sarah. It's great to be here to discuss these fascinating developments in AI.",
                "confidence": 0.96,
                "speaker": "Dr. Michael Rodriguez",
            },
            {
                "start": 45.3,
                "end": 78.9,
                "text": "Let's start with the big question everyone's asking: How close are we to achieving artificial general intelligence, or AGI?",
                "confidence": 0.95,
                "speaker": "Sarah Chen",
            },
            {
                "start": 78.9,
                "end": 120.4,
                "text": "That's a complex question, Sarah. While we've made incredible progress with models like GPT-4 and Claude, true AGI is still quite far off. What we have now are narrow AI systems that excel at specific tasks.",
                "confidence": 0.94,
                "speaker": "Dr. Michael Rodriguez",
            },
            {
                "start": 120.4,
                "end": 145.7,
                "text": "But the capabilities are growing rapidly. These models can now reason, create, and even show glimpses of what we might call understanding.",
                "confidence": 0.93,
                "speaker": "Dr. Michael Rodriguez",
            },
        ],
        "confidence_scores": [0.98, 0.97, 0.96, 0.95, 0.94, 0.93],
        "speaker_info": {
            "speakers": ["Sarah Chen", "Dr. Michael Rodriguez"],
            "speaker_count": 2,
            "speaker_roles": {
                "Sarah Chen": "Host",
                "Dr. Michael Rodriguez": "Guest Expert",
            },
        },
        "accuracy": 0.955,
        "word_count": 156,
        "processing_time": 45.2,
        "model_used": "whisper-1",
        "model_config": {
            "temperature": 0.0,
            "language": "en",
            "task": "transcribe",
        },
        "created_at": "2024-01-15T14:30:00Z",
        "updated_at": "2024-01-15T14:35:00Z",
    }


def _build_second_transcript() -> dict:
    """Return a minimal second transcript so the batch export has two items."""
    return {
        "id": "tech-podcast-episode-43",
        "title": "Cybersecurity in the AI Era",
        "content": {
            "text": "In this episode, we explore how AI is changing the cybersecurity landscape.",
            "language": "en",
            "duration": 1800.0,
        },
        "segments": [
            {
                "start": 0.0,
                "end": 30.0,
                "text": "In this episode, we explore how AI is changing the cybersecurity landscape.",
                "confidence": 0.95,
                "speaker": "Sarah Chen",
            }
        ],
        "created_at": "2024-01-16T10:00:00Z",
    }


def _safe_filename_stem(title: str) -> str:
    """Turn a title into a filename stem: spaces to '_', colons dropped, lowercased."""
    return title.replace(' ', '_').replace(':', '').lower()


async def _export_all_formats(export_service: "ExportService", transcript: dict) -> list:
    """Export ``transcript`` once per demo format and return the written paths.

    A failure for one format is reported on stdout and does not stop the
    remaining formats (this is a demo, so best-effort is intentional).
    """
    formats = [
        (ExportFormat.JSON, "Full transcript data with metadata"),
        (ExportFormat.TXT, "Clean plain text for reading"),
        (ExportFormat.SRT, "Video subtitles with timestamps"),
        (ExportFormat.MARKDOWN, "Formatted document with speakers and metadata"),
    ]
    # The stem depends only on the title, so compute it once for all formats.
    safe_title = _safe_filename_stem(transcript['title'])

    exported_files = []
    for format_enum, description in formats:
        try:
            print(f"📤 Exporting as {format_enum.value.upper()}: {description}")
            filename = f"{safe_title}.{format_enum.value}"
            output_path = await export_service.export_transcript(
                transcript=transcript,
                format=format_enum,
                output_path=_EXPORT_DIR / filename,
            )
            exported_files.append(output_path)
            print(f" ✅ Saved to: {output_path}")
            # Show file size
            file_size = output_path.stat().st_size
            print(f" 📊 File size: {file_size:,} bytes")
        except Exception as e:  # demo boundary: report and move on to the next format
            print(f" ❌ Error: {e}")
        print()
    return exported_files


async def _run_batch_export(export_service: "ExportService", transcripts: list) -> None:
    """Batch-export ``transcripts`` as JSON and print a per-item summary."""
    print("🔄 Demonstrating batch export...")
    try:
        batch_results = await export_service.batch_export(
            transcripts=transcripts,
            format=ExportFormat.JSON,
            output_dir=_EXPORT_DIR / "batch",
        )
        # A ``None`` entry marks a failed item; use the same test for the
        # summary count and the per-item report so the two always agree.
        succeeded = sum(result is not None for result in batch_results)
        print(f" ✅ Batch export completed: {succeeded}/{len(transcripts)} successful")
        for i, result in enumerate(batch_results):
            if result is not None:
                print(f" 📄 {result.name}")
            else:
                print(f" ❌ Failed to export transcript {i+1}")
    except Exception as e:  # demo boundary: report instead of crashing
        print(f" ❌ Batch export error: {e}")


def _print_previews(exported_files: list) -> None:
    """Print the first ``_PREVIEW_CHARS`` characters of each exported file."""
    print("📖 Sample content from each format:")
    print("=" * 50)
    for output_path in exported_files:
        if not output_path.exists():
            continue
        print(f"\n📄 {output_path.name}:")
        print("-" * 30)
        with open(output_path, 'r', encoding='utf-8') as f:
            # One character more than the preview is enough to know
            # whether the file was truncated; no need to load all of it.
            head = f.read(_PREVIEW_CHARS + 1)
        preview = head[:_PREVIEW_CHARS]
        if len(head) > _PREVIEW_CHARS:
            preview += "..."
        print(preview)


async def main():
    """Demonstrate export functionality with real transcript data.

    Exports one sample transcript in every demo format, batch-exports two
    transcripts as JSON, then prints a short preview of each exported file.
    Progress and errors are written to stdout; nothing is returned.
    """
    # Ensure the output directory exists even when main() is called from
    # another module instead of via the script entry point. Idempotent.
    _EXPORT_DIR.mkdir(parents=True, exist_ok=True)

    # Create export service
    export_service = ExportService(export_dir=_EXPORT_DIR)

    # Real transcript data from a tech podcast about AI
    real_transcript = _build_podcast_transcript()

    print("🚀 Exporting transcript in multiple formats...")
    print(f"📝 Transcript: {real_transcript['title']}")
    print(f"⏱️ Duration: {real_transcript['content']['duration'] / 60:.1f} minutes")
    print(f"👥 Speakers: {', '.join(real_transcript['speaker_info']['speakers'])}")
    print()

    # Export in all formats
    exported_files = await _export_all_formats(export_service, real_transcript)

    # Demonstrate batch export
    await _run_batch_export(
        export_service, [real_transcript, _build_second_transcript()]
    )
    print()

    # Show sample content from each format
    _print_previews(exported_files)
    print()
    print("🎉 Export demonstration completed!")
    print(f"📁 All files saved to: {export_service.export_dir}")
if __name__ == "__main__":
    # Script entry point: make sure the output directory is present
    # before anything tries to write into it.
    exports_root = Path("examples/exports")
    exports_root.mkdir(parents=True, exist_ok=True)

    # Drive the async demo to completion on a fresh event loop.
    asyncio.run(main())