From f614cbdcebf4362f107e4d50600cc4fa72dafd44 Mon Sep 17 00:00:00 2001 From: "Antonio J. Delgado" Date: Fri, 19 Jan 2024 20:08:07 +0200 Subject: [PATCH] Add banned hostnames --- .../discover_mastodon_servers.py | 168 +++++++++++------- 1 file changed, 107 insertions(+), 61 deletions(-) diff --git a/discover-mastodon-servers/discover_mastodon_servers.py b/discover-mastodon-servers/discover_mastodon_servers.py index 8763b7c..6225368 100755 --- a/discover-mastodon-servers/discover_mastodon_servers.py +++ b/discover-mastodon-servers/discover_mastodon_servers.py @@ -12,6 +12,7 @@ from logging.handlers import SysLogHandler import sqlite3 import time import re +import json import click import click_config_file import requests @@ -63,13 +64,21 @@ class DiscoverMastodonServers: timeout=10 ) if result.status_code < 400: - if 'application/json' in result.headers['Content-Type']: - data = result.json() - if 'error' not in data: - return data + if 'Content-Type' in result.headers: + if 'application/json' in result.headers['Content-Type']: + data = result.json() + if 'error' not in data: + return data + else: + self._log.debug( + "Server '%s' didn't reply with JSON data.", server + ) else: self._log.debug( - "Server '%s' didn't reply with JSON data.", server + "Server '%s' didn't return Content-Type header. Headers: '%s'. Content returned: '%s'", + server, + json.dumps(result.headers, indent=2), + result.content ) else: self._log.debug( @@ -95,70 +104,103 @@ class DiscoverMastodonServers: ) return data + def get_instance_info(self, server): + '''Get all server information''' + result = {} + instance = self.get_path(server, '/api/v1/instance') + if instance: + result['instance'] = instance + directory = [] + result['directory'] = directory + offset=0 + while len(directory) == 0: + directory = self.get_path( + server, + f"/api/v1/directory?limit=80&offset={offset}" + ) + if directory: + result['directory'] = result['directory'] + directory + offset += 80 + return result + + def test_banned_server(self, server): + '''Check if a server name match agains any banned regular expressions''' + for banned in self.config['regexp_banned_host']: + match = re.search(banned, server) + if match: + self._log.debug( + "Regexp '%s' match server '%s'", + banned, + server + ) + return True + return False + def discover(self): '''Discover new servers''' all_servers = [] new_servers_count = 0 for server in self.servers.items(): all_servers.append(server[0]) - if not server[1]['private']: - self._log.debug("Fetching peers of the server '%s'", server[0]) - data = self.get_path(server[0], 'api/v1/instance/peers') - if data: - for new_server in data: - new_servers_count += 1 - self._log.debug( - "Adding new server '%s'", - new_server - ) - all_servers.append(new_server) - self.write_record( - (new_server, - { - "name": new_server, - "last_update": time.time(), - "private": False - } - ) - ) - self._log.debug("Fetching public timeline in server '%s'", server[0]) - data = self.get_timeline(server[0]) - if data: - for item in data: - if 'uri' in item: - match_server = re.match(r'https?://([^/]*)/', item['uri']) - if match_server: - new_server = match_server.group(1) - if new_server not in all_servers: - data = self.get_timeline(new_server) - if data: - new_servers_count += 1 - self._log.debug( - "Adding new server '%s'", - new_server - ) - all_servers.append(new_server) - private = False - else: - private = True - self.write_record( - (new_server, - { - "name": new_server, - "last_update": time.time(), - "private": private - } - ) - ) - else: - # Item in public timeline don't have an URI + if not self.test_banned_server(server[0]): + if not server[1]['private']: + self._log.debug("Fetching peers of the server '%s'", server[0]) + data = self.get_path(server[0], 'api/v1/instance/peers') + if data: + for new_server in data: + new_servers_count += 1 self._log.debug( - "Item don't have URI. %s", - item + "Adding new server '%s'", + new_server ) - else: - server[1]['private'] = True - self.write_record(server) + all_servers.append(new_server) + self.write_record( + (new_server, + { + "name": new_server, + "last_update": time.time(), + "private": False + } + ) + ) + self._log.debug("Fetching public timeline in server '%s'", server[0]) + data = self.get_timeline(server[0]) + if data: + for item in data: + if 'uri' in item: + match_server = re.match(r'https?://([^/]*)/', item['uri']) + if match_server: + new_server = match_server.group(1) + if new_server not in all_servers: + data = self.get_timeline(new_server) + if data: + new_servers_count += 1 + self._log.debug( + "Adding new server '%s'", + new_server + ) + all_servers.append(new_server) + private = False + else: + private = True + self.write_record( + (new_server, + { + "name": new_server, + "last_update": time.time(), + "private": private + } + ) + ) + else: + # Item in public timeline don't have an URI + self._log.debug( + "Item don't have URI. %s", + item + ) + else: + server[1]['private'] = True + self.write_record(server) return new_servers_count def write_record(self, record, table='servers'): @@ -279,6 +321,10 @@ class DiscoverMastodonServers: '--database-file', '-d', default='mastodon-servers.db', help='File with the database of results.' ) +@click.option( + '--regexp-banned-host', '-r', multiple=True, + help='Regular expression for banned host names.' +) @click_config_file.configuration_option() def __main__(**kwargs): return DiscoverMastodonServers(**kwargs)