#!/usr/bin/env python3
|
|
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
#
|
|
|
|
# This script is licensed under GNU GPL version 2.0 or above
|
|
|
|
# (c) 2024 Antonio J. Delgado
|
|
|
|
"""Discover Mastodon servers by looking at public timelines"""
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import logging
|
|
|
|
from logging.handlers import SysLogHandler
|
|
|
|
import sqlite3
|
|
|
|
import time
|
|
|
|
import re
|
|
|
|
import click
|
|
|
|
import click_config_file
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
|
|
class DiscoverMastodonServers:
|
|
|
|
'''Class to Discover Mastodon Servers'''
|
|
|
|
|
|
|
|
    def __init__(self, **kwargs):
        """Initialize logging, HTTP session and database, then crawl until no new servers appear."""
        self.config = kwargs
        # Default log file: $HOME/log/discover-mastodon-servers.log
        # (falls back to %USERPROFILE%, then the current directory).
        if 'log_file' not in kwargs or kwargs['log_file'] is None:
            self.config['log_file'] = os.path.join(
                os.environ.get(
                    'HOME',
                    os.environ.get(
                        'USERPROFILE',
                        os.getcwd()
                    )
                ),
                'log',
                'discover-mastodon-servers.log'
            )
        self._init_log()
        self.session = requests.Session()
        # NOTE(review): when no --proxy is given this sets {'https': None};
        # requests treats that as "no proxy", but worth confirming.
        self.session.proxies.update({ 'https': self.config['proxy']})
        self.conn = sqlite3.connect(self.config['database_file'])
        self.read_db()
        # First run: seed the crawl with the configured initial server.
        if len(self.servers) == 0:
            self._log.debug("Adding initial server.")
            self.servers[self.config['initial_server']] = {
                "name": self.config['initial_server'],
                "last_update": time.time(),
                "private": False,
                "peers": True,
                "timeline": True,
            }
        # Repeat full passes until a pass discovers nothing new.
        new_servers_count = 1
        while new_servers_count > 0:
            new_servers_count = self.discover()
def get_timeline(self, server):
|
|
|
|
'''Get the data of a public timeline for a given server'''
|
2024-01-27 22:55:22 +01:00
|
|
|
return self.get_path(server, 'api/v1/timelines/public?remote=true&limit=40')
|
2024-01-19 16:42:15 +01:00
|
|
|
|
|
|
|
def get_path(self, server, endpoint):
|
|
|
|
'''Get the data of an endpoint of a server'''
|
|
|
|
data = None
|
|
|
|
try:
|
|
|
|
result = self.session.get(
|
2024-01-20 18:06:37 +01:00
|
|
|
f"https://{server['name']}/{endpoint}",
|
2024-01-19 16:42:15 +01:00
|
|
|
timeout=10
|
|
|
|
)
|
2024-01-20 18:06:37 +01:00
|
|
|
server['status'] = result.status_code
|
|
|
|
server['state'] = 'OK'
|
2024-01-19 16:42:15 +01:00
|
|
|
if result.status_code < 400:
|
2024-01-19 19:08:07 +01:00
|
|
|
if 'Content-Type' in result.headers:
|
|
|
|
if 'application/json' in result.headers['Content-Type']:
|
|
|
|
data = result.json()
|
|
|
|
if 'error' not in data:
|
|
|
|
return data
|
|
|
|
else:
|
2024-01-20 18:06:37 +01:00
|
|
|
server['state'] = 'Error'
|
2024-01-19 19:08:07 +01:00
|
|
|
self._log.debug(
|
2024-01-20 18:06:37 +01:00
|
|
|
"Server '%s' didn't reply with JSON data.", server['name']
|
2024-01-19 19:08:07 +01:00
|
|
|
)
|
2024-01-19 16:42:15 +01:00
|
|
|
else:
|
2024-01-20 18:06:37 +01:00
|
|
|
server['state'] = 'Error'
|
2024-01-19 16:42:15 +01:00
|
|
|
self._log.debug(
|
2024-01-19 19:08:07 +01:00
|
|
|
"Server '%s' didn't return Content-Type header. Headers: '%s'. Content returned: '%s'",
|
2024-01-20 18:06:37 +01:00
|
|
|
server['name'],
|
2024-01-23 19:51:17 +01:00
|
|
|
result.headers,
|
2024-01-19 19:08:07 +01:00
|
|
|
result.content
|
2024-01-19 16:42:15 +01:00
|
|
|
)
|
|
|
|
else:
|
2024-01-20 18:06:37 +01:00
|
|
|
server['state'] = 'Error'
|
2024-01-19 16:42:15 +01:00
|
|
|
self._log.debug(
|
2024-01-20 18:06:37 +01:00
|
|
|
"Server '%s' returned error code %s.", server['name'], result.status_code
|
2024-01-19 16:42:15 +01:00
|
|
|
)
|
|
|
|
except requests.exceptions.ReadTimeout as error:
|
2024-01-20 18:06:37 +01:00
|
|
|
server['state'] = 'Error'
|
2024-01-19 16:42:15 +01:00
|
|
|
self._log.warning(
|
|
|
|
"Server '%s' didn't respond on time. %s",
|
2024-01-20 18:06:37 +01:00
|
|
|
server['name'],
|
2024-01-19 16:42:15 +01:00
|
|
|
error
|
|
|
|
)
|
|
|
|
except requests.exceptions.SSLError as error:
|
2024-01-20 18:06:37 +01:00
|
|
|
server['state'] = 'SSL Error'
|
2024-01-19 16:42:15 +01:00
|
|
|
self._log.warning(
|
|
|
|
"Server '%s' don't have a valid SSL certificate. %s",
|
2024-01-20 18:06:37 +01:00
|
|
|
server['name'],
|
2024-01-19 16:42:15 +01:00
|
|
|
error
|
|
|
|
)
|
|
|
|
except requests.exceptions.ConnectionError as error:
|
2024-01-20 18:06:37 +01:00
|
|
|
server['state'] = 'Error'
|
2024-01-19 16:42:15 +01:00
|
|
|
self._log.warning(
|
|
|
|
"Server '%s' connection failed. %s",
|
2024-01-20 18:06:37 +01:00
|
|
|
server['name'],
|
|
|
|
error
|
|
|
|
)
|
|
|
|
except requests.exceptions.TooManyRedirects as error:
|
|
|
|
server['state'] = 'Error'
|
|
|
|
self._log.warning(
|
|
|
|
"Server '%s' redirected too many times. %s",
|
|
|
|
server['name'],
|
2024-01-19 16:42:15 +01:00
|
|
|
error
|
|
|
|
)
|
2024-01-23 19:51:17 +01:00
|
|
|
except Exception as error:
|
|
|
|
server['state'] = 'Error'
|
|
|
|
self._log.warning(
|
|
|
|
"Error fetching endpoint '%s' from server '%s'. %s",
|
|
|
|
endpoint,
|
|
|
|
server['name'],
|
|
|
|
error
|
|
|
|
)
|
2024-01-19 16:42:15 +01:00
|
|
|
return data
|
|
|
|
|
2024-01-19 19:08:07 +01:00
|
|
|
def get_instance_info(self, server):
|
|
|
|
'''Get all server information'''
|
2024-01-20 18:06:37 +01:00
|
|
|
instance = self.get_path(server['name'], '/api/v1/instance')
|
2024-01-19 19:08:07 +01:00
|
|
|
if instance:
|
2024-01-20 18:06:37 +01:00
|
|
|
server['instance'] = instance
|
|
|
|
server['directory'] = []
|
2024-01-19 19:08:07 +01:00
|
|
|
offset=0
|
2024-01-20 18:06:37 +01:00
|
|
|
while len(server['directory']) == 0:
|
2024-01-19 19:08:07 +01:00
|
|
|
directory = self.get_path(
|
|
|
|
server,
|
|
|
|
f"/api/v1/directory?limit=80&offset={offset}"
|
|
|
|
)
|
|
|
|
if directory:
|
2024-01-20 18:06:37 +01:00
|
|
|
server['directory'] = server['directory'] + directory
|
2024-01-19 19:08:07 +01:00
|
|
|
offset += 80
|
|
|
|
|
2024-01-20 18:06:37 +01:00
|
|
|
def test_banned_server(self, server_name):
|
2024-01-19 19:08:07 +01:00
|
|
|
'''Check if a server name match agains any banned regular expressions'''
|
|
|
|
for banned in self.config['regexp_banned_host']:
|
2024-01-20 18:06:37 +01:00
|
|
|
match = re.search(banned, server_name)
|
2024-01-19 19:08:07 +01:00
|
|
|
if match:
|
|
|
|
self._log.debug(
|
2024-01-19 20:10:49 +01:00
|
|
|
"Regexp '%s' match server '%s', banned.",
|
2024-01-19 19:08:07 +01:00
|
|
|
banned,
|
2024-01-20 18:06:37 +01:00
|
|
|
server_name
|
2024-01-19 19:08:07 +01:00
|
|
|
)
|
|
|
|
return True
|
|
|
|
return False
|
|
|
|
|
2024-01-19 14:33:49 +01:00
|
|
|
    def discover(self) -> int:
        '''Discover new servers'''
        # Hosts already handled in this pass (known + newly found), so the
        # same host is not processed or queued twice in one sweep.
        all_servers = []
        new_servers_count = 0
        for server_name, server in self.servers.items():
            all_servers.append(server_name)
            if not self.test_banned_server(server_name):
                # Backfill bookkeeping fields for records that may lack them
                # (e.g. rows loaded from the database carry only a subset).
                if 'state' not in server:
                    server['state'] = 'Unknown'
                if 'status' not in server:
                    server['status'] = 0
                if 'peers' not in server:
                    server['peers'] = True
                if 'timeline' not in server:
                    server['timeline'] = True
                # Only crawl servers that are public and not already failing.
                if not server['private'] and 'Error' not in server['state']:
                    self._log.debug("Fetching peers of the server '%s'", server_name)
                    data = self.get_path(server, 'api/v1/instance/peers')
                    if data:
                        # Peers endpoint returns a plain list of host names.
                        for new_server in data:
                            if ((not self.test_banned_server(new_server)) and
                                    (new_server not in self.servers) and
                                    (new_server not in all_servers)):
                                new_servers_count += 1
                                self._log.debug(
                                    "Adding new server '%s' from peers",
                                    new_server
                                )
                                all_servers.append(new_server)
                                self.write_record(
                                    {
                                        "name": new_server,
                                        "last_update": time.time(),
                                        "private": False,
                                        "peers": True,
                                        "timeline": True,
                                    }
                                )
                    else:
                        # Remember that this server exposes no peers endpoint.
                        server['peers'] = False
                    self._log.debug("Fetching public timeline in server '%s'", server_name)
                    data = self.get_timeline(server)
                    if data:
                        for item in data:
                            if 'uri' in item:
                                # Extract the host part of the status URI.
                                match_server = re.match(r'https?://([^/]*)/', item['uri'])
                                if match_server:
                                    new_server = match_server.group(1)
                                    if not self.test_banned_server(new_server) and new_server not in all_servers:
                                        new_server_obj = { "name": new_server }
                                        # Probe the candidate's public timeline to
                                        # classify it as private or not.
                                        # NOTE(review): this rebinds `data` while the
                                        # outer `for item in data` loop still iterates
                                        # the original list — safe in Python, but easy
                                        # to misread.
                                        data = self.get_timeline(new_server_obj)
                                        if data:
                                            new_servers_count += 1
                                            self._log.debug(
                                                "Adding new server '%s' from timeline",
                                                new_server
                                            )
                                            all_servers.append(new_server)
                                            new_server_obj['private'] = False
                                        else:
                                            new_server_obj['private'] = True
                                        self.write_record(new_server_obj)
                            else:
                                # Item in public timeline don't have an URI
                                self._log.debug(
                                    "Item don't have URI. %s",
                                    item
                                )
                    else:
                        # No readable public timeline on this server.
                        server['timeline'] = False
                # Persist the (possibly updated) record for this server.
                self.write_record(server)
        return new_servers_count
def write_record(self, record, table='servers'):
|
|
|
|
'''Write record to a table'''
|
2024-01-20 18:06:37 +01:00
|
|
|
if 'state' not in record:
|
|
|
|
record['state'] = 'Unknown'
|
|
|
|
if 'status' not in record:
|
|
|
|
record['status'] = 0
|
|
|
|
if 'peers' not in record:
|
|
|
|
record['peers'] = True
|
|
|
|
if 'timeline' not in record:
|
|
|
|
record['timeline'] = True
|
2024-01-23 19:51:17 +01:00
|
|
|
if 'last_update' not in record:
|
|
|
|
record['last_update'] = time.time()
|
2024-01-19 14:33:49 +01:00
|
|
|
cur = self.conn.cursor()
|
|
|
|
result_select = cur.execute(f"""
|
2024-01-20 18:06:37 +01:00
|
|
|
SELECT name FROM {table} WHERE name = '{record['name']}'
|
2024-01-19 14:33:49 +01:00
|
|
|
""")
|
|
|
|
if len(result_select.fetchall()) > 0:
|
|
|
|
self._log.debug('Record exists, updating.')
|
2024-01-20 18:06:37 +01:00
|
|
|
query = f"UPDATE {table} SET "
|
|
|
|
count = 0
|
|
|
|
for key in record.keys():
|
|
|
|
if count == 0:
|
|
|
|
query += f"{key} = :{key} "
|
|
|
|
else:
|
|
|
|
query += f",{key} = :{key} "
|
|
|
|
count += 1
|
|
|
|
query += "WHERE name = :name"
|
2024-01-19 14:33:49 +01:00
|
|
|
else:
|
|
|
|
self._log.debug('Record doesn\'t exist, inserting.')
|
2024-01-20 18:06:37 +01:00
|
|
|
query = f"INSERT INTO {table} VALUES (:" + ",:".join(record.keys()) + ")"
|
|
|
|
self._log.debug("Writing record '%s'...",
|
|
|
|
record
|
2024-01-19 14:33:49 +01:00
|
|
|
)
|
2024-01-20 18:06:37 +01:00
|
|
|
try:
|
|
|
|
result_update = cur.execute(query, record)
|
|
|
|
self._log.debug("Added record %s.", result_update.lastrowid)
|
|
|
|
except Exception as error:
|
|
|
|
self._log.error("Error running query '%s' with record '%s'. %s", query, record, error)
|
|
|
|
sys.exit(1)
|
2024-01-19 14:33:49 +01:00
|
|
|
cur.close()
|
|
|
|
self.conn.commit()
|
|
|
|
|
|
|
|
def read_db(self):
|
|
|
|
'''Read database file'''
|
|
|
|
cur = self.conn.cursor()
|
2024-01-20 18:06:37 +01:00
|
|
|
query = """CREATE TABLE IF NOT EXISTS servers(
|
|
|
|
name TEXT PRIMARY KEY,
|
|
|
|
last_update REAL,
|
|
|
|
private INT,
|
|
|
|
peers INT,
|
|
|
|
timeline INT,
|
|
|
|
status INT,
|
|
|
|
state TEXT
|
|
|
|
)"""
|
|
|
|
try:
|
|
|
|
cur.execute(query)
|
|
|
|
except Exception as error:
|
|
|
|
self._log.error("Error running query to create table '%s'. %s", query, error)
|
|
|
|
sys.exit(2)
|
|
|
|
query = "SELECT * FROM servers ORDER BY last_update DESC"
|
|
|
|
try:
|
|
|
|
result_select = cur.execute(query)
|
|
|
|
except Exception as error:
|
|
|
|
self._log.error("Error running query to list servers '%s'. %s", query, error)
|
|
|
|
sys.exit(3)
|
2024-01-19 14:33:49 +01:00
|
|
|
self.servers = {}
|
|
|
|
for item in result_select.fetchall():
|
|
|
|
self.servers[item[0]] = {
|
|
|
|
"name": item[0],
|
|
|
|
"last_update": item[1],
|
|
|
|
"private": item[2]
|
|
|
|
}
|
|
|
|
self._log.debug("There are %s servers in the database.", len(self.servers))
|
|
|
|
self.conn.commit()
|
|
|
|
|
|
|
|
def _init_log(self):
|
|
|
|
''' Initialize log object '''
|
|
|
|
self._log = logging.getLogger("discover-mastodon-servers")
|
|
|
|
self._log.setLevel(logging.DEBUG)
|
|
|
|
|
|
|
|
sysloghandler = SysLogHandler()
|
|
|
|
sysloghandler.setLevel(logging.DEBUG)
|
|
|
|
self._log.addHandler(sysloghandler)
|
|
|
|
|
|
|
|
streamhandler = logging.StreamHandler(sys.stdout)
|
|
|
|
streamhandler.setLevel(
|
|
|
|
logging.getLevelName(self.config.get("debug_level", 'INFO'))
|
|
|
|
)
|
|
|
|
self._log.addHandler(streamhandler)
|
|
|
|
|
|
|
|
if 'log_file' in self.config:
|
|
|
|
log_file = self.config['log_file']
|
|
|
|
else:
|
|
|
|
home_folder = os.environ.get(
|
|
|
|
'HOME', os.environ.get('USERPROFILE', '')
|
|
|
|
)
|
|
|
|
log_folder = os.path.join(home_folder, "log")
|
|
|
|
log_file = os.path.join(log_folder, "discover-mastodon-servers.log")
|
|
|
|
|
|
|
|
if not os.path.exists(os.path.dirname(log_file)):
|
|
|
|
os.mkdir(os.path.dirname(log_file))
|
|
|
|
|
|
|
|
filehandler = logging.handlers.RotatingFileHandler(
|
|
|
|
log_file, maxBytes=102400000
|
|
|
|
)
|
|
|
|
# create formatter
|
|
|
|
formatter = logging.Formatter(
|
|
|
|
'%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
|
|
|
|
)
|
|
|
|
filehandler.setFormatter(formatter)
|
|
|
|
filehandler.setLevel(logging.DEBUG)
|
|
|
|
self._log.addHandler(filehandler)
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
@click.command()
@click.option(
    "--debug-level",
    "-d",
    default="INFO",
    type=click.Choice(
        ["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"],
        case_sensitive=False,
    ),
    help='Set the debug level for the standard output.'
)
@click.option('--log-file', '-l', help="File to store all debug messages.")
@click.option(
    '--initial-server', '-i', default='mastodon.social',
    help='First Mastodon server to reach to read public timeline and discover others.'
)
@click.option('--proxy', '-p', help='Proxy URL to use.')
# FIX: '-d' was declared for both --debug-level and --database-file, so one
# of the two short flags was unreachable; --database-file now uses '-b'.
@click.option(
    '--database-file', '-b', default='mastodon-servers.db',
    help='File with the database of results.'
)
@click.option(
    '--regexp-banned-host', '-r', multiple=True,
    help='Regular expression for banned host names.'
)
@click_config_file.configuration_option()
def __main__(**kwargs):
    # CLI entry point: all options are forwarded as keyword arguments.
    return DiscoverMastodonServers(**kwargs)


if __name__ == "__main__":
    __main__()