# soundwave/backend/audio/tasks_lyrics.py

# 217 lines
# 6.6 KiB
# Python

"""Celery tasks for automatic lyrics fetching"""
from celery import shared_task
from django.utils import timezone
from datetime import timedelta
import logging
logger = logging.getLogger(__name__)
@shared_task(name="audio.fetch_lyrics_for_audio")
def fetch_lyrics_for_audio(audio_youtube_id: str, force: bool = False):
    """
    Fetch lyrics for a single audio track.

    Looks up the ``Audio`` row by YouTube ID and passes it to
    ``LyricsService.fetch_and_store_lyrics``. Exceptions are caught and
    reported through the returned dict instead of being raised.

    Args:
        audio_youtube_id: YouTube ID of the audio
        force: Force fetch even if already attempted

    Returns:
        ``{"status": "success", "youtube_id": <id>}`` when the service call
        completes without raising, otherwise
        ``{"status": "error", "error": <message>}``.
    """
    # Project imports are kept function-local, as in the rest of this module.
    from audio.models import Audio
    from audio.lyrics_service import LyricsService

    try:
        audio = Audio.objects.get(youtube_id=audio_youtube_id)
        service = LyricsService()
        service.fetch_and_store_lyrics(audio, force=force)
    except Audio.DoesNotExist:
        logger.error(f"Audio not found: {audio_youtube_id}")
        return {"status": "error", "error": "Audio not found"}
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Error fetching lyrics for {audio_youtube_id}: {e}")
        return {"status": "error", "error": str(e)}
    else:
        # The service's return value is not inspected here; "success" means
        # the call finished without raising.
        logger.info(f"Fetched lyrics for {audio.title}")
        return {"status": "success", "youtube_id": audio_youtube_id}
@shared_task(name="audio.fetch_lyrics_batch")
def fetch_lyrics_batch(audio_ids: list, delay_seconds: int = 2):
    """
    Fetch lyrics for multiple audio tracks with delay between requests.

    Each ID is looked up and passed to
    ``LyricsService.fetch_and_store_lyrics`` with ``force=False``. The pause
    is applied only *between* service calls: there is no sleep before the
    first call, after the last one, or for IDs that were skipped because no
    ``Audio`` row exists.

    Args:
        audio_ids: List of YouTube IDs (``None`` or empty yields all-zero
            counters)
        delay_seconds: Delay between requests to avoid rate limiting;
            values <= 0 (or ``None``) disable the delay

    Returns:
        Dict with counters ``'success'`` (service call completed without
        raising; the returned lyrics object is not inspected), ``'failed'``
        (any other exception) and ``'skipped'`` (audio not found).
    """
    import time

    from audio.models import Audio
    from audio.lyrics_service import LyricsService

    service = LyricsService()
    results = {
        'success': 0,
        'failed': 0,
        'skipped': 0,
    }

    # Normalise once so the loop does not re-check the sign every iteration.
    pause = max(delay_seconds or 0, 0)
    request_made = False

    for youtube_id in audio_ids or ():
        try:
            audio = Audio.objects.get(youtube_id=youtube_id)

            # Throttle only when a previous service call has actually happened.
            if request_made and pause:
                time.sleep(pause)
            request_made = True

            service.fetch_and_store_lyrics(audio, force=False)
        except Audio.DoesNotExist:
            results['skipped'] += 1
            logger.warning(f"Audio not found: {youtube_id}")
        except Exception as e:
            results['failed'] += 1
            # logger.exception records the traceback alongside the message.
            logger.exception(f"Error fetching lyrics for {youtube_id}: {e}")
        else:
            results['success'] += 1
            logger.info(f"Fetched lyrics for {audio.title}")

    return results
@shared_task(name="audio.auto_fetch_lyrics")
def auto_fetch_lyrics(limit: int = 50, max_attempts: int = 3):
    """
    Automatically fetch lyrics for audio without lyrics.

    This task should be scheduled to run periodically (e.g., every hour).

    Args:
        limit: Maximum number of tracks to process
        max_attempts: Skip tracks that have been attempted this many times

    Returns:
        ``{"status": "no_work", "message": ...}`` when the candidate query is
        empty; otherwise a dict with counters ``'processed'`` (service calls
        that returned), ``'success'`` (returned lyrics report ``has_lyrics``)
        and ``'failed'`` (no lyrics, or an exception was raised).
    """
    import time

    from audio.models import Audio
    from audio.lyrics_service import LyricsService

    # Downloaded tracks, minus those whose lyrics row has been attempted
    # ``max_attempts`` times or more.
    # NOTE(review): this query does not visibly exclude tracks that already
    # have lyrics, and the slice has no explicit ordering, so the same rows
    # may be selected on every run -- confirm this is intended.
    audio_without_lyrics = Audio.objects.filter(
        downloaded=True
    ).exclude(
        lyrics__fetch_attempted=True,
        lyrics__fetch_attempts__gte=max_attempts
    )[:limit]

    if not audio_without_lyrics:
        logger.info("No audio tracks need lyrics fetching")
        return {"status": "no_work", "message": "No tracks need lyrics"}

    service = LyricsService()
    results = {
        'processed': 0,
        'success': 0,
        'failed': 0,
    }

    for index, audio in enumerate(audio_without_lyrics):
        # Small delay to be nice to the API; only needed between calls, so
        # nothing is slept before the first track or after the last one.
        if index:
            time.sleep(1)

        try:
            lyrics = service.fetch_and_store_lyrics(audio, force=False)
            results['processed'] += 1
            if lyrics.has_lyrics:
                results['success'] += 1
            else:
                results['failed'] += 1
        except Exception as e:
            # Keep going with the remaining tracks; record the traceback.
            logger.exception(f"Error in auto-fetch for {audio.title}: {e}")
            results['failed'] += 1

    logger.info(f"Auto-fetch completed: {results}")
    return results
@shared_task(name="audio.cleanup_lyrics_cache")
def cleanup_lyrics_cache(days_old: int = 30):
    """
    Purge stale rows from the lyrics cache.

    Two deletions run in order against ``LyricsCache``: first rows flagged
    ``not_found`` whose ``cached_date`` is before the cutoff, then rows with
    ``access_count`` of 0 whose ``last_accessed`` is before the cutoff.

    Args:
        days_old: Remove cache entries older than this many days

    Returns:
        Dict with ``'deleted_not_found'`` and ``'deleted_unused'``, each the
        first element of the corresponding ``QuerySet.delete()`` result.
    """
    from audio.models_lyrics import LyricsCache
    from django.utils import timezone
    from datetime import timedelta

    threshold = timezone.now() - timedelta(days=days_old)

    # Pass 1: old negative ("not found") cache entries.
    stale_misses = LyricsCache.objects.filter(
        not_found=True,
        cached_date__lt=threshold,
    )
    removed_not_found, _ = stale_misses.delete()

    # Pass 2: entries never accessed and not touched since the cutoff.
    never_used = LyricsCache.objects.filter(
        last_accessed__lt=threshold,
        access_count=0,
    )
    removed_unused, _ = never_used.delete()

    logger.info(
        f"Cleaned up {removed_not_found} not_found and {removed_unused} unused cache entries"
    )
    return {
        'deleted_not_found': removed_not_found,
        'deleted_unused': removed_unused,
    }
@shared_task(name="audio.refetch_failed_lyrics")
def refetch_failed_lyrics(days_old: int = 7, limit: int = 20, max_attempts: int = 5):
    """
    Retry fetching lyrics for tracks that failed before.

    Selects ``Lyrics`` rows that were attempted, hold neither synced nor
    plain lyrics, are not marked instrumental, were last fetched before the
    cutoff and have fewer than ``max_attempts`` attempts, then re-runs
    ``LyricsService.fetch_and_store_lyrics`` with ``force=True`` for each.

    Args:
        days_old: Retry tracks that failed more than this many days ago
        limit: Maximum number of tracks to retry
        max_attempts: Don't retry rows already attempted this many times
            (defaults to 5, the previously hard-coded ceiling)

    Returns:
        Dict with counters ``'retried'`` (service calls that returned),
        ``'success'`` (result reports ``has_lyrics``) and ``'failed'``
        (still no lyrics, or an exception was raised).
    """
    import time

    from audio.models_lyrics import Lyrics
    from audio.lyrics_service import LyricsService
    from django.utils import timezone
    from datetime import timedelta

    cutoff_date = timezone.now() - timedelta(days=days_old)

    # Find tracks that failed but haven't been tried recently
    failed_lyrics = Lyrics.objects.filter(
        fetch_attempted=True,
        synced_lyrics='',
        plain_lyrics='',
        is_instrumental=False,
        fetched_date__lt=cutoff_date,
        fetch_attempts__lt=max_attempts,
    )[:limit]

    service = LyricsService()
    results = {
        'retried': 0,
        'success': 0,
        'failed': 0,
    }

    for index, lyrics in enumerate(failed_lyrics):
        # Be nice to the API: pause between retries only, not before the
        # first one or after the last one.
        if index:
            time.sleep(2)

        try:
            updated = service.fetch_and_store_lyrics(lyrics.audio, force=True)
            results['retried'] += 1
            if updated.has_lyrics:
                results['success'] += 1
            else:
                results['failed'] += 1
        except Exception as e:
            # Continue with the remaining rows; record the traceback.
            logger.exception(f"Error retrying lyrics for {lyrics.audio.title}: {e}")
            results['failed'] += 1

    logger.info(f"Refetch completed: {results}")
    return results