discover-mastodon-servers/discover_mastodon_servers/discover_mastodon_servers.py

382 lines
14 KiB
Python
Raw Normal View History

2024-01-19 14:33:49 +01:00
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
#
# This script is licensed under GNU GPL version 2.0 or above
# (c) 2024 Antonio J. Delgado
"""Discover Mastodon servers by looking at public timelines"""
import sys
import os
import logging
from logging.handlers import SysLogHandler
import sqlite3
import time
import re
import click
import click_config_file
import requests
class DiscoverMastodonServers:
    '''Discover Mastodon servers by crawling peer lists and public timelines.

    Instantiating the class runs the whole crawl: known servers are loaded
    from the SQLite database, every known server is asked for its peers and
    its public timeline, and newly found host names are stored in the
    database and in ``self.servers``. Rounds are repeated until a round finds
    no new server.

    Configuration keys read from ``kwargs``: ``log_file``, ``debug_level``,
    ``proxy``, ``database_file``, ``initial_server`` and
    ``regexp_banned_host``.
    '''

    # Page size requested from the directory endpoint.
    _DIRECTORY_PAGE_SIZE = 80

    def __init__(self, **kwargs):
        self.config = kwargs
        if 'log_file' not in kwargs or kwargs['log_file'] is None:
            self.config['log_file'] = os.path.join(
                os.environ.get(
                    'HOME',
                    os.environ.get(
                        'USERPROFILE',
                        os.getcwd()
                    )
                ),
                'log',
                'discover-mastodon-servers.log'
            )
        self._init_log()

        self.session = requests.Session()
        # Only configure a proxy when one was actually given.
        proxy = self.config.get('proxy')
        if proxy:
            self.session.proxies.update({'https': proxy})

        self.conn = sqlite3.connect(
            self.config.get('database_file', 'mastodon-servers.db')
        )
        self.read_db()
        if len(self.servers) == 0:
            self._log.debug("Adding initial server.")
            initial_server = self.config.get('initial_server', 'mastodon.social')
            self.servers[initial_server] = {
                "name": initial_server,
                "last_update": time.time(),
                "private": False,
                "peers": True,
                "timeline": True,
            }

        # discover() registers what it finds in self.servers, so a round
        # that finds nothing new means the crawl is complete.
        new_servers_count = 1
        while new_servers_count > 0:
            new_servers_count = self.discover()

    def get_timeline(self, server):
        '''Get the data of a public timeline for a given server'''
        return self.get_path(server, 'api/v1/timelines/public?remote=true&limit=40')

    def get_path(self, server, endpoint):
        '''Get the JSON data of an endpoint of a server.

        ``server`` is a server dictionary with at least a ``name`` key; its
        ``status`` and ``state`` keys are updated with the outcome of the
        request. ``endpoint`` may be given with or without a leading slash.

        Returns the decoded JSON document, or None when the request failed,
        the answer was not JSON or the document is an error object.
        '''
        data = None
        # Strip leading slashes so '/api/...' does not become 'host//api/...'.
        url = f"https://{server['name']}/{endpoint.lstrip('/')}"
        try:
            result = self.session.get(url, timeout=10)
            server['status'] = result.status_code
            server['state'] = 'OK'
            if result.status_code >= 400:
                server['state'] = 'Error'
                self._log.debug(
                    "Server '%s' returned error code %s.", server['name'], result.status_code
                )
            elif 'Content-Type' not in result.headers:
                server['state'] = 'Error'
                self._log.debug(
                    "Server '%s' didn't return Content-Type header. Headers: '%s'. Content returned: '%s'",
                    server['name'],
                    result.headers,
                    result.content
                )
            elif 'application/json' not in result.headers['Content-Type']:
                server['state'] = 'Error'
                self._log.debug(
                    "Server '%s' didn't reply with JSON data.", server['name']
                )
            else:
                data = result.json()
                if isinstance(data, dict) and 'error' in data:
                    # An error object is not usable data: callers iterate
                    # the result as a list of peers or statuses.
                    server['state'] = 'Error'
                    self._log.debug(
                        "Server '%s' replied with an error. %s",
                        server['name'],
                        data['error']
                    )
                    data = None
        except requests.exceptions.ReadTimeout as error:
            server['state'] = 'Error'
            self._log.warning(
                "Server '%s' didn't respond on time. %s",
                server['name'],
                error
            )
        except requests.exceptions.SSLError as error:
            server['state'] = 'SSL Error'
            self._log.warning(
                "Server '%s' don't have a valid SSL certificate. %s",
                server['name'],
                error
            )
        except requests.exceptions.ConnectionError as error:
            server['state'] = 'Error'
            self._log.warning(
                "Server '%s' connection failed. %s",
                server['name'],
                error
            )
        except requests.exceptions.TooManyRedirects as error:
            server['state'] = 'Error'
            self._log.warning(
                "Server '%s' redirected too many times. %s",
                server['name'],
                error
            )
        except Exception as error:
            # Best effort: a single misbehaving server must not stop the crawl.
            server['state'] = 'Error'
            self._log.warning(
                "Error fetching endpoint '%s' from server '%s'. %s",
                endpoint,
                server['name'],
                error
            )
        return data

    def get_instance_info(self, server):
        '''Get all server information.

        Stores the instance document in ``server['instance']`` and the
        concatenated pages of the profile directory in
        ``server['directory']``. Nothing is stored when the instance
        document can't be fetched.
        '''
        instance = self.get_path(server, '/api/v1/instance')
        if not instance:
            return
        server['instance'] = instance
        server['directory'] = []
        offset = 0
        while True:
            page = self.get_path(
                server,
                f"/api/v1/directory?limit={self._DIRECTORY_PAGE_SIZE}&offset={offset}"
            )
            # A failed request or an empty page ends the pagination.
            if not isinstance(page, list) or not page:
                break
            server['directory'].extend(page)
            if len(page) < self._DIRECTORY_PAGE_SIZE:
                # Short page: there is nothing left to fetch.
                break
            offset += self._DIRECTORY_PAGE_SIZE

    def test_banned_server(self, server_name):
        '''Check if a server name matches any of the banned regular expressions'''
        for banned in self.config.get('regexp_banned_host') or ():
            match = re.search(banned, server_name)
            if match:
                self._log.debug(
                    "Regexp '%s' match server '%s', banned.",
                    banned,
                    server_name
                )
                return True
        return False

    def discover(self):
        '''Run one discovery round over every known server.

        Servers that are banned, private or in an error state are skipped.
        Newly found servers are written to the database and added to
        ``self.servers`` so the next round crawls them too.

        Returns the number of new, reachable servers found in this round.
        '''
        known = set(self.servers)
        new_servers_count = 0
        # Iterate over a snapshot: the helpers add entries to self.servers.
        for server_name, server in list(self.servers.items()):
            if self.test_banned_server(server_name):
                continue
            server.setdefault('state', 'Unknown')
            server.setdefault('status', 0)
            server.setdefault('peers', True)
            server.setdefault('timeline', True)
            if server.get('private') or 'Error' in server['state']:
                continue
            new_servers_count += self._discover_from_peers(server, known)
            new_servers_count += self._discover_from_timeline(server, known)
            self.write_record(server)
        return new_servers_count

    def _discover_from_peers(self, server, known):
        '''Register the unknown peers of a server; return how many were added.'''
        self._log.debug("Fetching peers of the server '%s'", server['name'])
        peers = self.get_path(server, 'api/v1/instance/peers')
        if not isinstance(peers, list) or not peers:
            server['peers'] = False
            return 0
        found = 0
        for peer in peers:
            if not isinstance(peer, str) or not peer or peer in known:
                continue
            if self.test_banned_server(peer):
                continue
            self._log.debug(
                "Adding new server '%s' from peers",
                peer
            )
            known.add(peer)
            record = {
                "name": peer,
                "last_update": time.time(),
                "private": False,
                "peers": True,
                "timeline": True,
            }
            self.servers[peer] = record
            self.write_record(record)
            found += 1
        return found

    def _discover_from_timeline(self, server, known):
        '''Register the unknown hosts seen in the public timeline of a server.

        Every unknown host is probed once through its own public timeline;
        hosts that don't answer are stored as private and are not counted.
        Returns the number of reachable servers added.
        '''
        self._log.debug("Fetching public timeline in server '%s'", server['name'])
        timeline = self.get_timeline(server)
        if not isinstance(timeline, list) or not timeline:
            server['timeline'] = False
            return 0
        found = 0
        for item in timeline:
            uri = item.get('uri') if isinstance(item, dict) else None
            if not isinstance(uri, str):
                # Item in public timeline don't have an URI
                self._log.debug(
                    "Item don't have URI. %s",
                    item
                )
                continue
            match_server = re.match(r'https?://([^/]*)/', uri)
            if not match_server:
                continue
            candidate = match_server.group(1)
            if not candidate or candidate in known:
                continue
            if self.test_banned_server(candidate):
                continue
            # Remember the host whatever the probe result is, so it is
            # probed only once even when it appears in many statuses.
            known.add(candidate)
            record = {"name": candidate}
            if self.get_timeline(record):
                found += 1
                self._log.debug(
                    "Adding new server '%s' from timeline",
                    candidate
                )
                record['private'] = False
            else:
                record['private'] = True
            self.servers[candidate] = record
            self.write_record(record)
        return found

    def write_record(self, record, table='servers'):
        '''Insert or update a record, identified by its name, in a table.

        Missing ``state``, ``status``, ``peers``, ``timeline`` and
        ``last_update`` keys are filled in ``record`` with default values.
        Only the keys that are columns of the table are written, each one to
        the column of the same name.

        Raises ValueError when ``table`` is not a plain identifier and exits
        the program with code 1 when the write fails.
        '''
        # Table names can't be bound as parameters, so validate before use.
        if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', table):
            raise ValueError(f"Invalid table name '{table}'")
        record.setdefault('state', 'Unknown')
        record.setdefault('status', 0)
        record.setdefault('peers', True)
        record.setdefault('timeline', True)
        record.setdefault('last_update', time.time())

        cur = self.conn.cursor()
        try:
            # Row layout of table_info: (cid, name, type, notnull, default, pk)
            table_columns = [
                row[1] for row in cur.execute(f"PRAGMA table_info({table})").fetchall()
            ]
            columns = [column for column in table_columns if column in record]
            values = {column: record[column] for column in columns}

            # The name comes from remote servers: always bind it.
            exists = cur.execute(
                f"SELECT 1 FROM {table} WHERE name = :name",
                {"name": record['name']}
            ).fetchone() is not None
            if exists:
                self._log.debug('Record exists, updating.')
                assignments = ", ".join(
                    f"{column} = :{column}" for column in columns if column != 'name'
                )
                query = f"UPDATE {table} SET {assignments} WHERE name = :name"
            else:
                self._log.debug('Record doesn\'t exist, inserting.')
                # Name the columns explicitly: the order of the keys of the
                # record is not the order of the columns of the table.
                placeholders = ", ".join(f":{column}" for column in columns)
                query = f"INSERT INTO {table} ({', '.join(columns)}) VALUES ({placeholders})"
            self._log.debug("Writing record '%s'...", record)
            try:
                result_update = cur.execute(query, values)
                self._log.debug("Added record %s.", result_update.lastrowid)
            except sqlite3.Error as error:
                self._log.error(
                    "Error running query '%s' with record '%s'. %s", query, record, error
                )
                sys.exit(1)
        finally:
            cur.close()
        self.conn.commit()

    def read_db(self):
        '''Create the servers table if needed and load the known servers.

        Fills ``self.servers`` with one dictionary per stored server, keyed
        by name. Only ``name``, ``last_update`` and ``private`` are restored;
        the remaining keys get their default values in discover().

        Exits the program with code 2 when the table can't be created and
        with code 3 when the servers can't be listed.
        '''
        cur = self.conn.cursor()
        try:
            query = """CREATE TABLE IF NOT EXISTS servers(
                name TEXT PRIMARY KEY,
                last_update REAL,
                private INT,
                peers INT,
                timeline INT,
                status INT,
                state TEXT
            )"""
            try:
                cur.execute(query)
            except sqlite3.Error as error:
                self._log.error("Error running query to create table '%s'. %s", query, error)
                sys.exit(2)
            query = "SELECT name, last_update, private FROM servers ORDER BY last_update DESC"
            try:
                rows = cur.execute(query).fetchall()
            except sqlite3.Error as error:
                self._log.error("Error running query to list servers '%s'. %s", query, error)
                sys.exit(3)
        finally:
            cur.close()
        self.servers = {
            name: {
                "name": name,
                "last_update": last_update,
                "private": private
            }
            for name, last_update, private in rows
        }
        self._log.debug("There are %s servers in the database.", len(self.servers))
        self.conn.commit()

    def _init_log(self):
        ''' Initialize log object '''
        self._log = logging.getLogger("discover-mastodon-servers")
        self._log.setLevel(logging.DEBUG)

        sysloghandler = SysLogHandler()
        sysloghandler.setLevel(logging.DEBUG)
        self._log.addHandler(sysloghandler)

        streamhandler = logging.StreamHandler(sys.stdout)
        # Level names are upper case; accept lower case and missing values.
        level_name = str(self.config.get("debug_level") or 'INFO').upper()
        streamhandler.setLevel(logging.getLevelName(level_name))
        self._log.addHandler(streamhandler)

        log_file = self.config.get('log_file')
        if not log_file:
            home_folder = os.environ.get(
                'HOME', os.environ.get('USERPROFILE', '')
            )
            log_file = os.path.join(home_folder, "log", "discover-mastodon-servers.log")
        log_folder = os.path.dirname(log_file)
        # A bare file name has no folder part: nothing to create then.
        if log_folder:
            os.makedirs(log_folder, exist_ok=True)

        # Without a backup count the handler never rolls the file over.
        filehandler = logging.handlers.RotatingFileHandler(
            log_file, maxBytes=102400000, backupCount=1
        )
        # create formatter
        formatter = logging.Formatter(
            '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
        )
        filehandler.setFormatter(formatter)
        filehandler.setLevel(logging.DEBUG)
        self._log.addHandler(filehandler)
        return True
@click.command()
# NOTE: '-d' used to be declared for both --debug-level and --database-file.
# It is kept on --database-file, the option declared last, and
# --debug-level uses '-D' instead.
@click.option(
    "--debug-level",
    "-D",
    default="INFO",
    type=click.Choice(
        ["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"],
        case_sensitive=False,
    ),
    help='Set the debug level for the standard output.'
)
@click.option('--log-file', '-l', help="File to store all debug messages.")
# @click.option("--dummy","-n", is_flag=True,
#     help="Don't do anything, just show what would be done.")
@click.option(
    '--initial-server', '-i', default='mastodon.social',
    help='First Mastodon server to reach to read public timeline and discover others.'
)
@click.option('--proxy', '-p', help='Proxy URL to use.')
@click.option(
    '--database-file', '-d', default='mastodon-servers.db',
    help='File with the database of results.'
)
@click.option(
    '--regexp-banned-host', '-r', multiple=True,
    help='Regular expression for banned host names.'
)
@click_config_file.configuration_option()
def __main__(**kwargs):
    """Discover Mastodon servers by looking at public timelines."""
    # Creating the object runs the whole discovery.
    return DiscoverMastodonServers(**kwargs)


# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    __main__()