From 63bab306e9aca4d406b07496af25204b3540fbcd Mon Sep 17 00:00:00 2001
From: "Antonio J. Delgado" <ad@susurrando.com>
Date: Fri, 15 Nov 2024 15:34:30 +0200
Subject: [PATCH] Add channel download logic and its requirements

---
 get_youtube_videos/get_youtube_videos.py | 237 +++++++++++++++++++++++
 requirements.txt                         |   5 +-
 2 files changed, 241 insertions(+), 1 deletion(-)

diff --git a/get_youtube_videos/get_youtube_videos.py b/get_youtube_videos/get_youtube_videos.py
index 58bc605..87e720f 100644
--- a/get_youtube_videos/get_youtube_videos.py
+++ b/get_youtube_videos/get_youtube_videos.py
@@ -7,15 +7,28 @@
 
 import sys
 import os
+import re
+import json
 import logging
 from logging.handlers import SysLogHandler
 import click
 import click_config_file
+import requests
+import feedparser
+import yt_dlp
 
 
 class GetYoutubeVideos:
+    '''Get YouTube videos from a series of channels'''
 
     def __init__(self, **kwargs):
+        self.time_duration_units = (
+            ('week', 60*60*24*7),
+            ('day', 60*60*24),
+            ('hour', 60*60),
+            ('min', 60),
+            ('sec', 1)
+        )
         self.config = kwargs
         if 'log_file' not in kwargs or kwargs['log_file'] is None:
             self.config['log_file'] = os.path.join(
@@ -30,6 +43,186 @@ class GetYoutubeVideos:
                 'get_youtube_videos.log'
             )
         self._init_log()
+        if os.path.exists(self.config['downloaded_database']):
+            with open(self.config['downloaded_database'], 'r', encoding='utf-8') as db_file:
+                self.downloaded_items = db_file.read().split('\n')
+        else:
+            self.downloaded_items = []
+        self.session = requests.Session()
+        self._process_channels()
+
+    def _process_channels(self):
+        '''Download the not-yet-seen videos of every configured channel.
+
+        Each channel's video feed is parsed with feedparser and every entry whose
+        video ID is missing from self.downloaded_items is inspected with yt_dlp.
+        Live, formerly live, over-long and duration-less videos are recorded via
+        _save_downloaded_items and skipped; the rest are downloaded into
+        self.config['download_dir']. A failing video only skips that entry.
+        total_count and channel_count are advanced by _yt_progress_hook.
+        '''
+        video_id_pattern = re.compile(r'v=([0-9A-Za-z_-]{11})')
+        # The info JSON files below are written straight into this folder.
+        os.makedirs(self.config['download_dir'], exist_ok=True)
+        self.total_count = 0
+        self.channels_count = 0
+        for channel in self.config['channels']:
+            self.channels_count += 1
+            self._log.debug(
+                "Processing channel %s/%s '%s'...",
+                self.channels_count,
+                len(self.config['channels']),
+                channel
+            )
+            feed = feedparser.parse(
+                f"https://www.youtube.com/feeds/videos.xml?channel_id={channel}"
+            )
+            self.channel_count = 0
+            self.entries_count = 0
+            for entry in feed['entries']:
+                # _yt_progress_hook only counts downloads, so the per-channel limit
+                # is enforced here (a limit of 0 or less disables the check).
+                if 0 < self.config['channel_limit'] <= self.channel_count:
+                    break
+                self.entries_count += 1
+                self._log.debug(
+                    "Processing video entry %s/%s...",
+                    self.entries_count,
+                    len(feed['entries'])
+                )
+                result = video_id_pattern.search(entry['link'])
+                if not result:
+                    self._log.error(
+                        "Error! Video ID not found in URI '%s'",
+                        entry['link']
+                    )
+                    continue
+                video_id = result.group(1)
+                if video_id in self.downloaded_items:
+                    continue
+                uri = f"https://www.youtube.com/watch?v={video_id}"
+                ydl_opts = {
+                    'logger': self._log,
+                    'progress_hooks': [self._yt_progress_hook],
+                }
+                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                    try:
+                        raw_video_info = ydl.extract_info(uri, download=False)
+                    except yt_dlp.utils.DownloadError as error:
+                        # Skip just this entry; later entries may still be available.
+                        self._log.debug("%s Skipping video.", error)
+                        continue
+                    video_info = ydl.sanitize_info(raw_video_info)
+                info_filename = os.path.join(self.config['download_dir'], f"{video_id}.json")
+                self._log.debug(
+                    "Writting information in to file '%s'",
+                    info_filename
+                )
+                with open(info_filename, 'w', encoding='utf-8') as info_file:
+                    json.dump(video_info, info_file, indent=2)
+                title = video_info.get('title', '?')
+                # 'duration' may be absent or None; either way it is unknown.
+                duration = video_info.get('duration')
+                skip_args = None
+                if video_info.get('live_status') == 'is_live':
+                    skip_args = ("Skipping video '%s' as it's a live video", title)
+                elif video_info.get('was_live'):
+                    skip_args = ("Skipping video '%s' as it was a live video", title)
+                elif duration is None:
+                    skip_args = ("Skipping video '%s' as there is no video duration", title)
+                elif duration > self.config['max_length']:
+                    skip_args = (
+                        "Skipping video '%s' as it was larger than %s",
+                        title,
+                        self._human_time_duration(self.config['max_length'])
+                    )
+                if skip_args:
+                    self._log.debug(*skip_args)
+                    # Record the ID so the video is not inspected again on later runs.
+                    self._save_downloaded_items(video_id)
+                    continue
+                self._log.info(
+                    "Downloading. Filename: '%s'. Video ID: '%s'. Video URL: '%s'. Duration: %s. Counts: %s/%s - %s/%s)",
+                    title,
+                    video_id,
+                    uri,
+                    self._human_time_duration(duration),
+                    self.total_count+1,
+                    self.config['total_limit'],
+                    self.channel_count+1,
+                    self.config['channel_limit']
+                )
+                ydl_opts = {
+                    'logger': self._log,
+                    'progress_hooks': [self._yt_progress_hook],
+                    'paths': {
+                        'temp': '/tmp',
+                        'home': self.config['download_dir']
+                    },
+                    'writesubtitles': True,
+                    'writeautomaticsub': True,
+                    'writethumbnail': True,
+                    'subtitlesformat': 'srt',
+                    'subtitleslangs': self.config['subtitle_langs'],
+                    'allow_multiple_audio_streams': True,
+                    'merge_output_format': 'mp4',
+                    'format': 'bestvideo+bestaudio[ext=m4a]/best',
+                }
+                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                    try:
+                        ydl.download([uri])
+                    except yt_dlp.utils.DownloadError as error:
+                        # Log and move on to the next entry instead of abandoning the channel.
+                        self._log.error(
+                            "Error getting video. %s",
+                            error
+                        )
+
+    def _yt_progress_hook(self, data):
+        '''yt_dlp progress hook: count each finished file and record it as downloaded.
+
+        The info file and the database are written before the total limit is
+        checked, because reaching that limit ends the program via sys.exit(0).
+        '''
+        if data['status'] != 'finished':
+            return
+        self.total_count += 1
+        self.channel_count += 1
+        info_filename = os.path.join(self.config['download_dir'], f"{os.path.basename(data['filename'])}.download_info.json")
+        self._log.debug("Writting download information in to file '%s'", info_filename)
+        with open(info_filename, 'w', encoding='utf-8') as info_file:
+            # NOTE(review): hook data is not passed through sanitize_info; str() covers values JSON cannot encode.
+            json.dump(data, info_file, indent=2, default=str)
+        if 'id' in data['info_dict']:
+            self._save_downloaded_items(data['info_dict']['id'])
+        if self.total_count == self.config['total_limit']:
+            self._log.info("Limit (%s) reached for videos from all channels", self.config['total_limit'])
+            sys.exit(0)
+        if self.channel_count == self.config['channel_limit']:
+            # Only reported here: a hook cannot break out of the caller's loop.
+            self._log.info(
+                "Limit (%s) reached for videos for this channel '%s'",
+                self.config['channel_limit'],
+                data['info_dict'].get('channel', '?')
+            )
+
+    def _save_downloaded_items(self, video_id):
+        '''Remember video_id and rewrite the database file (one ID per line, blanks dropped).'''
+        if video_id not in self.downloaded_items:
+            self.downloaded_items.append(video_id)
+        with open(self.config['downloaded_database'], 'w', encoding='utf-8') as db_file:
+            db_file.writelines(f"{item}\n" for item in self.downloaded_items if item)
+
+    def _human_time_duration(self, seconds):
+        '''Return seconds as text such as "1 hour, 30 mins"; 0 gives 'inf', values below 1 give '?'.'''
+        if seconds == 0:
+            return 'inf'
+        seconds, parts = max(int(seconds), 0), []  # clamp: divmod would wrap negatives around
+        for unit, div in self.time_duration_units:
+            amount, seconds = divmod(seconds, div)
+            if amount > 0:
+                parts.append(f'{amount} {unit}{"" if amount == 1 else "s"}')
+        return ', '.join(parts) or '?'
 
     def _init_log(self):
         ''' Initialize log object '''
@@ -85,6 +278,50 @@ class GetYoutubeVideos:
 @click.option('--log-file', '-l', help="File to store all debug messages.")
 # @click.option("--dummy","-n", is_flag=True,
 # help="Don't do anything, just show what would be done.")
+@click.option(
+    '--downloaded-database', '-d',
+    default=f"{os.environ.get('HOME', os.environ.get('USERPROFILE', ''))}/.config/downloaded_youtube_videos",
+    help='File to store the IDs of downloaded videos'
+)
+@click.option(
+    '--download-dir', '-f',
+    default=f"{os.environ.get('HOME', os.environ.get('USERPROFILE', ''))}/downloaded_youtube_videos",
+    help='Folder to store the downloaded videos'
+)
+@click.option(
+    '--channels', '-c',
+    multiple=True,
+    required=True,
+    help='YouTube channels IDs to look up'
+)
+@click.option(
+    '--channel-limit',  # no short flag: '-l' is already taken by --log-file
+    default=5,
+    type=int,
+    help='Maximun number of videos to download from a channel'
+)
+@click.option(
+    '--total-limit', '-L',
+    default=5,
+    type=int,
+    help='Maximun number of videos to download in total'
+)
+@click.option(
+    '--max-length',  # no short flag: '-d' is already taken by --downloaded-database
+    default=5400,
+    type=int,
+    help='Maximun duration of videos to download in seconds'
+)
+@click.option(
+    '--subtitle-langs', '-s',
+    multiple=True,
+    default=['en.*'],
+    help='''List of languages of the subtitles to download (can be regex).
+            The list may contain "all" to refer to all the available
+            subtitles. The language can be prefixed with a "-" to
+            exclude it from the requested languages, e.g. ['all', '-live_chat'].
+            And you can use wildcards like en.*'''
+)
 @click_config_file.configuration_option()
 def __main__(**kwargs):
     return GetYoutubeVideos(**kwargs)
diff --git a/requirements.txt b/requirements.txt
index 66bf966..776dfd5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 click
-click_config_file
\ No newline at end of file
+click_config_file
+requests
+feedparser
+yt_dlp