From 63bab306e9aca4d406b07496af25204b3540fbcd Mon Sep 17 00:00:00 2001
From: "Antonio J. Delgado"
Date: Fri, 15 Nov 2024 15:34:30 +0200
Subject: [PATCH] add req

---
 get_youtube_videos/get_youtube_videos.py | 278 +++++++++++++++++++++++
 requirements.txt                         |   5 +-
 2 files changed, 282 insertions(+), 1 deletion(-)

diff --git a/get_youtube_videos/get_youtube_videos.py b/get_youtube_videos/get_youtube_videos.py
index 58bc605..87e720f 100644
--- a/get_youtube_videos/get_youtube_videos.py
+++ b/get_youtube_videos/get_youtube_videos.py
@@ -7,15 +7,28 @@
 
 import sys
 import os
+import re
+import json
 import logging
 from logging.handlers import SysLogHandler
 import click
 import click_config_file
+import requests
+import feedparser
+import yt_dlp
 
 
 class GetYoutubeVideos:
+    '''Get YouTube videos from a series of channels'''
 
     def __init__(self, **kwargs):
+        self.time_duration_units = (
+            ('week', 60*60*24*7),
+            ('day', 60*60*24),
+            ('hour', 60*60),
+            ('min', 60),
+            ('sec', 1)
+        )
         self.config = kwargs
         if 'log_file' not in kwargs or kwargs['log_file'] is None:
             self.config['log_file'] = os.path.join(
@@ -30,6 +43,225 @@ class GetYoutubeVideos:
                 'get_youtube_videos.log'
             )
         self._init_log()
+        # The video information files are written into the download
+        # folder before yt_dlp creates it, so make sure it exists.
+        os.makedirs(self.config['download_dir'], exist_ok=True)
+        self.downloaded_items = []
+        if os.path.exists(self.config['downloaded_database']):
+            with open(self.config['downloaded_database'], 'r', encoding='utf-8') as db_file:
+                # Drop blank lines, otherwise the empty entry left after the
+                # trailing newline is written back again on every save.
+                self.downloaded_items = [
+                    line.strip() for line in db_file if line.strip()
+                ]
+        self.session = requests.Session()
+        self._process_channels()
+
+    def _process_channels(self):
+        '''Download the new videos listed in the feed of every channel.
+
+        Stop as soon as the total limit is reached and move on to the
+        next channel when the per-channel limit is reached.
+        '''
+        self.total_count = 0
+        self.channels_count = 0
+        for channel in self.config['channels']:
+            self.channels_count += 1
+            self._log.debug(
+                "Processing channel %s/%s '%s'...",
+                self.channels_count,
+                len(self.config['channels']),
+                channel
+            )
+            feed = feedparser.parse(
+                f"https://www.youtube.com/feeds/videos.xml?channel_id={channel}"
+            )
+            self.channel_count = 0
+            self.entries_count = 0
+            for entry in feed['entries']:
+                self.entries_count += 1
+                self._log.debug(
+                    "Processing video entry %s/%s...",
+                    self.entries_count,
+                    len(feed['entries'])
+                )
+                result = re.search(r'v=([0-9a-zA-Z_-]{11})', entry['link'])
+                if not result:
+                    self._log.error(
+                        "Error! Video ID not found in URI '%s'",
+                        entry['link']
+                    )
+                    continue
+                video_id = result.group(1)
+                if video_id in self.downloaded_items:
+                    continue
+                if not self._download_video(video_id):
+                    # A skipped or failed video must not stop the rest of
+                    # the feed from being processed.
+                    continue
+                self.total_count += 1
+                self.channel_count += 1
+                if self.total_count == self.config['total_limit']:
+                    self._log.info(
+                        "Limit (%s) reached for videos from all channels",
+                        self.config['total_limit'],
+                    )
+                    return
+                if self.channel_count == self.config['channel_limit']:
+                    self._log.info(
+                        "Limit (%s) reached for videos for this channel '%s'",
+                        self.config['channel_limit'],
+                        channel
+                    )
+                    break
+
+    def _download_video(self, video_id):
+        '''Download the video with the given ID unless it must be skipped.
+
+        Live videos, former live videos, videos without a known duration
+        and videos longer than the configured maximum are recorded as
+        processed without being downloaded.
+
+        Return True only when the video was downloaded.
+        '''
+        uri = f"https://www.youtube.com/watch?v={video_id}"
+        ydl_opts = {
+            'logger': self._log,
+            'progress_hooks': [self._yt_progress_hook],
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            try:
+                raw_video_info = ydl.extract_info(uri, download=False)
+            except yt_dlp.utils.DownloadError as error:
+                self._log.debug(
+                    "%s Skipping video.",
+                    error
+                )
+                return False
+            video_info = ydl.sanitize_info(raw_video_info)
+        info_filename = os.path.join(self.config['download_dir'], f"{video_id}.json")
+        self._log.debug(
+            "Writting information in to file '%s'",
+            info_filename
+        )
+        with open(info_filename, 'w', encoding='utf-8') as info_file:
+            json.dump(video_info, info_file, indent=2)
+        title = video_info.get('title', '?')
+        # The duration can be missing or None, neither can be compared.
+        duration = video_info.get('duration')
+        if video_info.get('live_status') == 'is_live':
+            self._log.debug(
+                "Skipping video '%s' as it's a live video",
+                title
+            )
+        elif video_info.get('was_live'):
+            self._log.debug(
+                "Skipping video '%s' as it was a live video",
+                title
+            )
+        elif duration is None:
+            self._log.debug(
+                "Skipping video '%s' as there is no video duration",
+                title
+            )
+        elif duration > self.config['max_length']:
+            self._log.debug(
+                "Skipping video '%s' as it was larger than %s",
+                title,
+                self._human_time_duration(self.config['max_length'])
+            )
+        else:
+            return self._fetch_video(video_id, uri, title, duration)
+        self._save_downloaded_items(video_id)
+        return False
+
+    def _fetch_video(self, video_id, uri, title, duration):
+        '''Download the video with yt_dlp. Return True when it succeeded.'''
+        self._log.info(
+            "Downloading. Filename: '%s'. Video ID: '%s'. Video URL: '%s'. Duration: %s. Counts: %s/%s - %s/%s)",
+            title,
+            video_id,
+            uri,
+            self._human_time_duration(duration),
+            self.total_count+1,
+            self.config['total_limit'],
+            self.channel_count+1,
+            self.config['channel_limit']
+        )
+        ydl_opts = {
+            'logger': self._log,
+            'progress_hooks': [self._yt_progress_hook],
+            'paths': {
+                'temp': '/tmp',
+                'home': self.config['download_dir']
+            },
+            'writesubtitles': True,
+            'writeautomaticsub': True,
+            'writethumbnail': True,
+            'subtitlesformat': 'srt',
+            'subtitleslangs': self.config['subtitle_langs'],
+            'allow_multiple_audio_streams': True,
+            'merge_output_format': 'mp4',
+            'format': 'bestvideo+bestaudio[ext=m4a]/best',
+        }
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            try:
+                ydl.download([uri])
+            except yt_dlp.utils.DownloadError as error:
+                self._log.error(
+                    "Error getting video. %s",
+                    error
+                )
+                return False
+        self._save_downloaded_items(video_id)
+        return True
+
+    def _yt_progress_hook(self, data):
+        '''Store the data reported by yt_dlp when a file is downloaded.
+
+        The counters are not updated and the process is not stopped from
+        here: exiting inside the hook would end the program before the
+        video ID is saved, and a video fetched as separate video and
+        audio files reports 'finished' once per file.
+        '''
+        if data['status'] != 'finished':
+            return
+        info_filename = os.path.join(
+            self.config['download_dir'],
+            f"{os.path.basename(data['filename'])}.download_info.json"
+        )
+        self._log.debug(
+            "Writting download information in to file '%s'",
+            info_filename
+        )
+        with open(info_filename, 'w', encoding='utf-8') as info_file:
+            # Debug dump only: stringify whatever JSON cannot represent
+            # instead of aborting the download.
+            json.dump(data, info_file, indent=2, default=str)
+
+    def _save_downloaded_items(self, video_id):
+        '''Record a video ID as processed and rewrite the database file'''
+        if video_id not in self.downloaded_items:
+            self.downloaded_items.append(video_id)
+        database_dir = os.path.dirname(self.config['downloaded_database'])
+        if database_dir:
+            os.makedirs(database_dir, exist_ok=True)
+        with open(self.config['downloaded_database'], 'w', encoding='utf-8') as db_file:
+            db_file.writelines(f"{item}\n" for item in self.downloaded_items)
+
+    def _human_time_duration(self, seconds):
+        '''Return time duration in a human readable formated string'''
+        if seconds == 0:
+            return 'inf'
+        seconds = int(seconds)
+        if seconds < 0:
+            return 'unknown'
+        parts = []
+        for unit, div in self.time_duration_units:
+            amount, seconds = divmod(seconds, div)
+            if amount > 0:
+                parts.append(f'{amount} {unit}{"" if amount == 1 else "s"}')
+        return ', '.join(parts) or '0 secs'
 
     def _init_log(self):
         ''' Initialize log object '''
@@ -85,6 +317,52 @@ class GetYoutubeVideos:
 @click.option('--log-file', '-l', help="File to store all debug messages.")
 # @click.option("--dummy","-n", is_flag=True,
 #     help="Don't do anything, just show what would be done.")
+# Every short flag below is unique: '-l' already belongs to --log-file and
+# a repeated short flag can only ever reach one of the options sharing it.
+@click.option(
+    '--downloaded-database', '-D',
+    default=f"{os.environ.get('HOME', os.environ.get('USERPROFILE', ''))}/.config/downloaded_youtube_videos",
+    help='File to store the IDs of downloaded videos'
+)
+@click.option(
+    '--download-dir', '-f',
+    default=f"{os.environ.get('HOME', os.environ.get('USERPROFILE', ''))}/downloaded_youtube_videos",
+    help='Folder to store the downloaded videos'
+)
+@click.option(
+    '--channels', '-c',
+    multiple=True,
+    required=True,
+    help='YouTube channels IDs to look up'
+)
+@click.option(
+    '--channel-limit', '-C',
+    default=5,
+    type=int,
+    help='Maximun number of videos to download from a channel'
+)
+@click.option(
+    '--total-limit', '-L',
+    default=5,
+    type=int,
+    help='Maximun number of videos to download in total'
+)
+@click.option(
+    '--max-length', '-m',
+    default=5400,
+    type=int,
+    help='Maximun duration of videos to download in seconds'
+)
+@click.option(
+    '--subtitle-langs', '-s',
+    multiple=True,
+    default=['en.*'],
+    help='''List of languages of the subtitles to download (can be regex).
+    The list may contain "all" to refer to all the available
+    subtitles. The language can be prefixed with a "-" to
+    exclude it from the requested languages, e.g. ['all', '-live_chat'].
+    And you can use wildcards like en.*'''
+)
 @click_config_file.configuration_option()
 def __main__(**kwargs):
     return GetYoutubeVideos(**kwargs)
diff --git a/requirements.txt b/requirements.txt
index 66bf966..776dfd5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 click
-click_config_file
\ No newline at end of file
+click_config_file
+requests
+feedparser
+yt_dlp