This commit is contained in:
Antonio J. Delgado 2024-11-15 15:34:30 +02:00
parent d19517d24c
commit 63bab306e9
2 changed files with 241 additions and 1 deletions

View file

@ -7,15 +7,28 @@
import sys
import os
import re
import json
import logging
from logging.handlers import SysLogHandler
import click
import click_config_file
import requests
import feedparser
import yt_dlp
class GetYoutubeVideos:
'''Get YouTube videos from a series of channels'''
def __init__(self, **kwargs):
self.time_duration_units = (
('week', 60*60*24*7),
('day', 60*60*24),
('hour', 60*60),
('min', 60),
('sec', 1)
)
self.config = kwargs
if 'log_file' not in kwargs or kwargs['log_file'] is None:
self.config['log_file'] = os.path.join(
@ -30,6 +43,186 @@ class GetYoutubeVideos:
'get_youtube_videos.log'
)
self._init_log()
if os.path.exists(self.config['downloaded_database']):
with open(self.config['downloaded_database'], 'r', encoding='utf-8') as db_file:
self.downloaded_items = db_file.read().split('\n')
else:
self.downloaded_items = []
self.session = requests.Session()
self._process_channels()
def _process_channels(self):
self.total_count = 0
self.channels_count = 0
for channel in self.config['channels']:
self.channels_count += 1
self._log.debug(
"Processing channel %s/%s '%s'...",
self.channels_count,
len(self.config['channels']),
channel
)
feed = feedparser.parse(
f"https://www.youtube.com/feeds/videos.xml?channel_id={channel}"
)
self.channel_count = 0
self.entries_count = 0
for entry in feed['entries']:
self.entries_count += 1
self._log.debug(
"Processing video entry %s/%s...",
self.entries_count,
len(feed['entries'])
)
result=re.search('v=([0-9a-zA-Z-_]{11})',entry['link'])
if result:
video_id=result.group(1)
if video_id not in self.downloaded_items:
# print(json.dumps(entry, indent=2))
uri=f"https://www.youtube.com/watch?v={video_id}"
ydl_opts = {
'logger': self._log,
'progress_hooks': [self._yt_progress_hook],
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
try:
raw_video_info = ydl.extract_info(uri, download=False)
except yt_dlp.utils.DownloadError as error:
self._log.debug(
"%s Skipping video.",
error
)
break
video_info = ydl.sanitize_info(raw_video_info)
info_filename = os.path.join(self.config['download_dir'], f"{video_id}.json")
self._log.debug(
"Writting information in to file '%s'",
info_filename
)
with open(info_filename, 'w', encoding='utf-8') as info_file:
json.dump(video_info, info_file, indent=2)
if video_info['live_status'] == 'is_live':
self._log.debug(
"Skipping video '%s' as it's a live video",
video_info.get('title', '?')
)
self.downloaded_items.append(video_id)
with open(
self.config['downloaded_database'], 'w', encoding='utf-8'
) as db_file:
for item in self.downloaded_items:
db_file.write(f"{item}\n")
break
if video_info['was_live']:
self._log.debug(
"Skipping video '%s' as it was a live video",
video_info.get('title', '?')
)
self._save_downloaded_items(video_id)
break
if ('duration' in video_info and
video_info['duration'] > self.config['max_length']):
self._log.debug(
"Skipping video '%s' as it was larger than %s",
video_info.get('title', '?'),
self._human_time_duration(self.config['max_length'])
)
self._save_downloaded_items(video_id)
break
if 'duration' not in video_info:
self._log.debug(
"Skipping video '%s' as there is no video duration",
video_info.get('title', '?')
)
self._save_downloaded_items(video_id)
break
self._log.info(
"Downloading. Filename: '%s'. Video ID: '%s'. Video URL: '%s'. Duration: %s. Counts: %s/%s - %s/%s)",
video_info.get('title', '?'),
video_id,
uri,
self._human_time_duration(video_info.get('duration', '-1')),
self.total_count+1,
self.config['total_limit'],
self.channel_count+1,
self.config['channel_limit']
)
ydl_opts = {
'logger': self._log,
'progress_hooks': [self._yt_progress_hook],
'paths': {
'temp': '/tmp',
'home': self.config['download_dir']
},
'writesubtitles': True,
'writeautomaticsub': True,
'writethumbnail': True,
'subtitlesformat': 'srt',
'subtitleslangs': self.config['subtitle_langs'],
'allow_multiple_audio_streams': True,
'merge_output_format': 'mp4',
'format': 'bestvideo+bestaudio[ext=m4a]/best',
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
try:
ydl.download(uri)
except yt_dlp.utils.DownloadError as error:
self._log.error(
"Error getting video. %s",
error
)
break
else:
self._log.error(
"Error! Video ID not found in URI '%s'",
entry['link']
)
def _yt_progress_hook(self, data):
if data['status'] == 'finished':
self.total_count += 1
self.channel_count += 1
if self.total_count == self.config['total_limit']:
self._log.info(
"Limit (%s) reached for videos from all channels",
self.config['total_limit'],
)
sys.exit(0)
if self.channel_count == self.config['channel_limit']:
self._log.info(
"Limit (%s) reached for videos for this channel '%s'",
self.config['channel_limit'],
data['info_dict'].get('channel', '?')
)
# break
info_filename = os.path.join(self.config['download_dir'], f"{os.path.basename(data['filename'])}.download_info.json")
self._log.debug(
"Writting download information in to file '%s'",
info_filename
)
with open(info_filename, 'w', encoding='utf-8') as info_file:
json.dump(data, info_file, indent=2)
if 'id' in data['info_dict']:
self._save_downloaded_items(data['info_dict']['id'])
def _save_downloaded_items(self, video_id):
self.downloaded_items.append(video_id)
with open(self.config['downloaded_database'], 'w', encoding='utf-8') as db_file:
for item in self.downloaded_items:
db_file.write(f"{item}\n")
def _human_time_duration(self, seconds):
'''Return time duration in a human readable formated string'''
if seconds == 0:
return 'inf'
parts = []
for unit, div in self.time_duration_units:
amount, seconds = divmod(int(seconds), div)
if amount > 0:
parts.append(f'{amount} {unit}{"" if amount == 1 else "s"}')
return ', '.join(parts)
def _init_log(self):
''' Initialize log object '''
@ -85,6 +278,50 @@ class GetYoutubeVideos:
# NOTE: every short flag below must be unique. '-l' and '-d' used to be
# declared twice; click keeps only the last declaration of a short flag, so
# they already resolved to --channel-limit and --max-length. The shadowed
# aliases on --log-file and --downloaded-database were removed.
@click.option('--log-file', help="File to store all debug messages.")
# @click.option("--dummy","-n", is_flag=True,
# help="Don't do anything, just show what would be done.")
@click.option(
    '--downloaded-database',
    default=f"{os.environ.get('HOME', os.environ.get('USERPROFILE', ''))}/.config/downloaded_youtube_videos",
    help='File to store the IDs of downloaded videos'
)
@click.option(
    '--download-dir', '-f',
    default=f"{os.environ.get('HOME', os.environ.get('USERPROFILE', ''))}/downloaded_youtube_videos",
    help='Folder to store the downloaded videos'
)
@click.option(
    '--channels', '-c',
    multiple=True,
    required=True,
    help='YouTube channels IDs to look up'
)
@click.option(
    '--channel-limit', '-l',
    default=5,
    type=int,
    help='Maximun number of videos to download from a channel'
)
@click.option(
    '--total-limit', '-L',
    default=5,
    type=int,
    help='Maximun number of videos to download in total'
)
@click.option(
    '--max-length', '-d',
    default=5400,
    type=int,
    help='Maximun duration of videos to download in seconds'
)
@click.option(
    '--subtitle-langs', '-s',
    multiple=True,
    default=['en.*'],
    help='''List of languages of the subtitles to download (can be regex).
The list may contain "all" to refer to all the available
subtitles. The language can be prefixed with a "-" to
exclude it from the requested languages, e.g. ['all', '-live_chat'].
And you can use wildcards like en.*'''
)
@click_config_file.configuration_option()
def __main__(**kwargs):
    # Entry point: builds GetYoutubeVideos with the parsed options; the
    # constructor itself starts processing the channels.
    return GetYoutubeVideos(**kwargs)

View file

@ -1,2 +1,5 @@
click
click_config_file
requests
feedparser
yt_dlp