redlib-instances/generate-instances-json.sh

893 lines
23 KiB
Bash
Executable File

#!/usr/bin/env bash
# generate-instances-json.sh
#
# Generate a JSON of Libreddit instances, given a CSV input listing those
# instances.
#
# Information on script options is available by running
# generate-instances-json.sh -h
#
# For more information on how to use this script, see README.md.
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
# A pipeline fails when any of its stages fails, not only the last one.
set -o pipefail

# Today's date in UTC (ISO 8601, date only).
TODAY="$(date -u -I)"

# Programs this script cannot run without.
DEPENDENCIES=(
    curl    # makes the HTTP requests
    jq      # JSON processing
)

# User-Agent handed to curl (via -A) when the caller did not export USER_AGENT.
DEFAULT_USER_AGENT="libreddit-instance-updater/0.1"

# Honor USER_AGENT from the environment; otherwise fall back to the default.
: "${USER_AGENT:=${DEFAULT_USER_AGENT}}"

# HTTP proxy used to reach nodes on I2P. Taken from the environment; empty
# when the caller did not provide one.
: "${I2P_HTTP_PROXY:=}"
# check_tor
#
# Succeeds when pidof finds a process named "tor"; fails otherwise. The exit
# status of pidof is passed through unchanged.
check_tor() {
    pidof -q tor
    return $?
}
# check_program PROGRAM
#
# Succeeds when PROGRAM resolves to a command (as reported by `command -v`);
# fails otherwise. Nothing is written to stdout.
check_program() {
    local program="${1}"

    command -v "${program}" >/dev/null
    return $?
}
# can_tor
#
# Succeeds when onion sites can be dialed, which currently means: check_tor
# reports that tor is running.
can_tor() {
    check_tor
    return $?
}
# can_i2p
#
# Succeeds when I2P sites can be dialed, i.e. when I2P_HTTP_PROXY holds a
# non-empty value.
can_i2p() {
    [[ "${I2P_HTTP_PROXY}" != "" ]]
}
# check_dependencies
#
# Checks every program listed in the global array DEPENDENCIES with
# check_program. The name of each missing program is printed to stdout, one
# per line.
#
# Returns 0 when every dependency is present, 1 when at least one is missing.
check_dependencies ()
{
    local -i rc=0
    # Keep the loop variable local so it does not overwrite a caller's "dep".
    local dep=

    for dep in "${DEPENDENCIES[@]}"
    do
        if ! check_program "${dep}"
        then
            rc=1
            printf '%s\n' "${dep}"
        fi
    done

    return "${rc}"
}
# read_csv_row [-d DELIMITER] [-v] ROW
#
# Reads a row of comma-separated values. Each value is printed as a separate
# line to stdout. The function prints nothing and returns 1 if the row is
# malformed (it ends inside a quoted value or right after a backslash), or if
# no ROW argument was passed to the function.
#
# The default delimiter is ','. Option -d can change this delimiter to a
# different character. The row is compared one character at a time, so only a
# single-character delimiter can ever match.
#
# Option -v will print "$i: " before each value, where $i starts at 1 and
# represents the value's position in the row.
#
# It is assumed that the total input is a row, which may include \n (if it's
# in, say, a quoted value).
#
# Double quotes group characters (delimiters inside them are kept) and are
# not copied into the value. A backslash escape is copied verbatim, backslash
# included, because the values ultimately end up inside JSON strings.
#
# This function does not modify any global state (in particular, IFS is left
# untouched).
read_csv_row ()
{
    local opt=
    local OPTIND
    local OPTARG
    local -i i=0
    local -i quote=0
    local -i esc=0
    local -i seen_delim=0
    local -i len=0
    local row=
    local print_col=n
    local char=
    local value=
    local -a values=()
    local delim=,

    while getopts "d:v" opt
    do
        case "${opt}" in
            d) delim="${OPTARG}" ;;
            v) print_col="y" ;;
            *) ;;
        esac
    done
    shift "$((OPTIND-1))"

    # Get row from arg.
    row="${1}"
    if [[ -z "${row}" ]]
    then
        return 1
    fi

    # Process row character by character.
    len="${#row}"
    for (( i = 0; i < len; i++ ))
    do
        char="${row:i:1}"

        # "Handle" escapes. Really, it just means writing the escape verbatim
        # into the string. Yes, that includes ". Making this a fully-featured
        # CSV reader is beyond the scope of this script.
        if (( esc ))
        then
            esc=0
            value+="\\${char}"
            # The value now has content, so it is no longer "empty right
            # after a delimiter". Without this, a final field made only of
            # escapes would be silently dropped below.
            seen_delim=0
            continue
        fi

        # \ triggers escape.
        # shellcheck disable=SC1003
        if [[ "${char}" == '\' ]]
        then
            esc=1
            continue
        fi

        # A delimiter means the end of the value (assuming we're not in a
        # quote). Plain array append: no IFS assignment, which would
        # otherwise persist in the calling shell.
        if (( ! quote )) && [[ "${char}" == "${delim}" ]]
        then
            values+=("${value}")
            value=
            seen_delim=1
            continue
        fi

        # " toggles quoting. The quote character itself is not part of the
        # value.
        if [[ "${char}" == '"' ]]
        then
            quote=$(( 1 - quote ))
            continue
        fi

        # Ordinary character.
        seen_delim=0
        value+="${char}"
    done

    # Handle unexpected end of row.
    if (( quote || esc ))
    then
        return 1
    fi

    # Add the final value: either it has content, or the row ended right
    # after a delimiter (a trailing empty value).
    if [[ -n "${value}" ]] || (( seen_delim ))
    then
        values+=("${value}")
    fi

    # Print each value on a separate line. printf is used instead of echo so
    # values such as "-n" or "-e" are printed rather than taken as options.
    i=1
    for value in "${values[@]}"
    do
        if [[ "${print_col}" == "y" ]]
        then
            printf '%s: ' "${i}"
            (( i++ ))
        fi
        printf '%s\n' "${value}"
    done
}
# canonicalize_url URL
#
# Prints the canonical form of URL to stdout:
#   -- the whole string is converted to lowercase;
#   -- trailing slashes are removed, but only when the path is just "/"
#      (or a run of slashes).
#
# Returns 1 if URL is missing or blank, or 2 if the lowercased string does not
# start with something shaped like scheme://host.
#
# TODO: Internationalized domain name support. For now, provide the URL in
# Punycode if needed.
canonicalize_url() {
    local raw="${1}"
    local lowered=
    local remainder=

    [[ -n "${raw}" ]] || return 1

    lowered="${raw,,}"

    # Reject anything that does not look like a URL.
    [[ "${lowered}" =~ ^[a-z0-9]+://[a-z0-9\.\-]+/? ]] || return 2

    # Whatever follows the first "/" after the host. Empty (or only slashes)
    # means the path is the root, and only then are trailing slashes removed.
    remainder="${lowered#*://*/}"
    if [[ "${remainder}" =~ ^/*$ ]]; then
        while [[ "${lowered}" == */ ]]; do
            lowered="${lowered%/}"
        done
    fi

    echo "${lowered}"
}
# get [-I] [-T] URL
#
# Makes an HTTP(S) GET request to the provided URL with curl. The response is
# written to standard out. The URL is first passed through canonicalize_url.
# If the host's top-level label is "onion", curl is told to use the SOCKS
# proxy at localhost:9050. If it is "i2p", curl is told to use
# I2P_HTTP_PROXY as its proxy. All other hosts must use https.
#
# The request is attempted up to three times; onion and I2P sites get a
# 60 second timeout per attempt, everything else 30 seconds.
#
# The return value is the curl return value (of the last attempt), or:
# 100: no or blank URL provided
# 101: invalid URL, or scheme is neither http nor https
# 102: URL is an onion site, but we can't connect to tor
# 103: non-tor, non-I2P URL has non-https scheme
# 104: prevented from dialing onion site (-T)
# 105: URL is an I2P site, but no I2P proxy was provided
# 106: prevented from dialing I2P site (-I)
#
# Option -T will cause get to skip an onion site, silently, and 104 will be
# returned. Option -I does the same for an I2P site, returning 106.
get ()
{
    local opt=
    local OPTIND
    local OPTARG
    local no_tor=n
    local no_i2p=n
    local url=
    local url_no_scheme=
    local scheme=
    local authority=
    local host=
    local zone=
    local -i rc=0
    local -i tries=3
    local -i timeout=30
    # Local retry counter, so a caller's own loop variable is not overwritten.
    local -i attempt=0
    local -a curl_cmd=(curl)

    while getopts "IT" opt
    do
        case "${opt}" in
            I) no_i2p=y ;;
            T) no_tor=y ;;
            *) ;;
        esac
    done
    shift "$((OPTIND-1))"

    if [[ -z "${1}" ]]
    then
        return 100
    fi

    # Get the canonical URL. canonicalize_url prints nothing on failure.
    url="$(canonicalize_url "${1}")"
    if [[ -z "${url}" ]]
    then
        return 101
    fi

    url_no_scheme="${url#*://}"

    # Extract the scheme. We only support HTTP or HTTPS. But maybe Libreddit
    # has a future on gopher...
    scheme="${url%%://*}"
    case "${scheme}" in
        http|https) ;;
        *) return 101 ;;
    esac

    # Extract the zone (the last dot-separated label of the host). Only the
    # host is inspected: the path, query, fragment, userinfo and port are cut
    # away first, so "https://example.com/page.onion" is a clearnet URL and
    # "http://abc.onion:8080/" is an onion URL.
    authority="${url_no_scheme%%[/?#]*}"
    authority="${authority##*@}"
    host="${authority%%:*}"
    if [[ "${host}" == *.* ]]
    then
        zone="${host##*.}"
    fi

    # Special handling for Onion and I2P sites.
    # - Onion/I2P sites can be either HTTPS or HTTP. But we want to enforce
    #   HTTPS on clearnet sites.
    # - Increase curl max-time to 60 seconds.
    if [[ "${zone,,}" == "onion" ]]
    then
        if [[ "${no_tor}" == "y" ]]
        then
            return 104
        fi
        # Don't bother if tor isn't running.
        if ! can_tor
        then
            return 102
        fi
        timeout=60
        curl_cmd=(curl --proxy socks5h://localhost:9050)
    elif [[ "${zone,,}" == "i2p" ]]
    then
        if [[ "${no_i2p}" == "y" ]]
        then
            return 106
        fi
        if ! can_i2p
        then
            return 105
        fi
        timeout=60
        curl_cmd=(curl -x "${I2P_HTTP_PROXY}")
    elif [[ "${scheme}" != "https" ]]
    then
        return 103
    fi

    # Use a custom User-Agent if provided. An unset USER_AGENT simply means
    # curl's own default is used.
    if [[ -n "${USER_AGENT:-}" ]]
    then
        curl_cmd+=(-A "${USER_AGENT}")
    fi

    # Do the GET. Try up to the number of times specified in the tries
    # variable, stopping at the first success.
    for (( attempt = 0; attempt < tries; attempt++ ))
    do
        "${curl_cmd[@]}" -m"${timeout}" -fs -- "${scheme}://${url_no_scheme}"
        rc=$?
        if (( rc == 0 ))
        then
            return 0
        fi
    done

    return "${rc}"
}
# create_instance_entry [-I] [-T] URL COUNTRY_CODE [CLOUDFLARE [DESCRIPTION]]
#
# Fetches URL with get, scrapes the Libreddit version from the response, and
# prints a one-line JSON object describing the instance to stdout. The URL is
# stored under the key "onion" or "i2p" when its host ends in .onion or .i2p
# (and the URL has no path), and under "url" otherwise.
#
# To specify that the instance is behind Cloudflare, set the third argument
# to exactly "true"; any other value will be interpreted as false.
#
# A description can be specified in the fourth argument (which means that, if
# you want to specify description for a website for which Cloudflare is
# _disabled_, set the third argument to ""). If you pass description in,
# all quotes will need to be escaped, as this will go directly into a
# JSON string value. None of the values are escaped by this function.
#
# Options -I and -T are passed on to get: -I makes it skip I2P sites, -T
# makes it skip onion sites.
#
# Return values:
#   0: success
#   1: URL or COUNTRY_CODE is missing or blank
#   2: the GET failed
#   3: the GET succeeded but the response was empty
#   4: no version string was found in the response
# 100: the site was skipped (get returned 104, 105 or 106)
create_instance_entry ()
{
    local cloudflare=n
    local res=
    local version=
    local json=
    local url_type="url"
    # Keep the loop variable local so a caller's "zone" is not overwritten.
    local zone=
    local -i rc=0
    local -a get_opts=()
    local opt=
    local OPTIND
    local OPTARG

    while getopts "IT" opt
    do
        case "${opt}" in
            I) get_opts+=("-I") ;;
            T) get_opts+=("-T") ;;
            *) ;;
        esac
    done
    shift "$((OPTIND-1))"

    local url="${1}"
    local country="${2}"
    local description="${4}"

    if [[ -z "${url}" || -z "${country}" ]]
    then
        return 1
    fi

    if [[ "${3}" == "true" ]]
    then
        cloudflare=y
    fi

    res="$(get "${get_opts[@]}" "${url}")"
    rc=$?
    if [[ ${rc} -ne 0 ]]
    then
        # 104-6 are returned if we prevented get from connecting to an
        # onion/i2p site (or no I2P proxy is available). This requires us to
        # return the special code 100.
        if [[ ${rc} -eq 104 || ${rc} -eq 105 || ${rc} -eq 106 ]]
        then
            return 100
        fi
        return 2
    fi

    if [[ -z "${res}" ]]
    then
        return 3
    fi

    # Scrape the version from the site: an element with id="version" whose
    # text starts with vMAJOR.MINOR.PATCH.
    #
    # Future versions of Libreddit may advertise the version in a <meta> tag
    # in <head>, but it doesn't right now.
    version="$(<<<"${res}" sed -nE 's/.*\s+id="version">(v([0-9]+\.){2}[0-9]+).*$/\1/p')"
    if [[ -z "${version}" ]]
    then
        return 4
    fi

    # Find out if this is an onion/i2p website.
    # Yeah, this is a little lazy and we could do this a bit better.
    for zone in onion i2p
    do
        if [[ "${url,,}" =~ ^https?://[^/]+\.${zone}/?$ ]]
        then
            url_type="${zone}"
        fi
    done

    # Build JSON.
    printf -v json '{"%s":"%s","country":"%s","version":"%s"' \
        "${url_type}" "${url}" "${country}" "${version}"

    if [[ "${cloudflare}" == "y" ]]
    then
        json+=',"cloudflare":true'
    fi

    if [[ -n "${description}" ]]
    then
        # DANGER: If the description string isn't properly escaped, the JSON
        # will be malformed!
        json+=",\"description\":\"${description}\""
    fi

    json+="}"

    printf '%s\n' "${json}"
}
# helpdoc
#
# Writes the usage text to stdout. The script path (BASH_SOURCE[0]) and the
# value of DEFAULT_USER_AGENT are substituted into the text.
helpdoc() {
    cat <<HELP_TEXT
USAGE
${BASH_SOURCE[0]} [-I INPUT_JSON] [-T] [-e | -f] [-i INPUT_CSV] [-o OUTPUT_JSON]
${BASH_SOURCE[0]} -h
DESCRIPTION
Generate a JSON of Libreddit instances, given a CSV file at INPUT_CSV
listing those instances. If INPUT_CSV is not given, this script will
read the CSV file from stdin.
The INPUT_CSV file must be a file in CSV syntax of the form
[url],[country code],[cloudflare enabled],[description]
where all four parameters are required (though the description may be
blank). Except for onion and I2P sites, all URLs MUST be HTTPS.
OUTPUT_JSON will be overwritten if it exists. No confirmation will be
requested from the user.
By default:
* This script will not attempt to connect to I2P instances. If you want
this script to consider instances on the I2P network, you will need to
provide an HTTP proxy in the environment variable I2P_HTTP_PROXY.
This proxy typically listens at 127.0.0.1:4444.
* This script will attempt to connect to instances in the CSV that are on
Tor, provided that it can (it will check to see if Tor is running).
If you want to disable connections to these onion sites, provide the
-T option.
* This script will return a non-zero status code when at least one instance
could not be reached. If you want this script always to return 0 even
when not all instances could be reached, provide the -e option (this
script will still return a non-zero code if there was a problem
constructing the final JSON object or if the file supplied to the -I
option could not be read).
OPTIONS
-I INPUT_JSON
Import the list of Libreddit onion and I2P instances from the file
INPUT_JSON. To use stdin, provide \`-I -\`. Implies -T, and further
causes the script to ignore the value in I2P_HTTP_PROXY. Note that the
argument provided to this option CANNOT be the same as the argument
provided to -i. If the JSON could not be read, the script will exit with
status code 1, even if -e is provided.
-T
Do not connect to Tor. Onion sites in INPUT_CSV will not be processed.
Assuming no other failure, the script will still exit with status code
0.
-e
Always exit with status code 0, even when at least one instance cannot
be reached, except in the situations where (1) the file in INPUT_JSON
(see \`-I\`) could not be processed; or (2) the JSON object could not
be constructed. Cannot be used together with -f.
-f
Force the script to exit, with status code 1, upon the first failure to
connect to an instance. Normally, the script will continue to build and
output the JSON even when one or more of the instances could not be
reached, though the exit code will be non-zero. Cannot be used together
with -e.
-i INPUT_CSV
Use INPUT_CSV as the input file. To read from stdin (the default
behavior), either omit this option or provide \`-i -\`. Note that the
argument provided to this option CANNOT be the same as the argument
provided to -I.
-o OUTPUT_JSON
Write the results to OUTPUT_JSON. Any existing file will be
overwritten. To write to stdout (the default behavior), either omit
this option or provide \`-o -\`.
ENVIRONMENT
USER_AGENT
Sets the User-Agent that curl will use when making the GET to each
website. By default, this script will tell curl to set its User-Agent
string to "${DEFAULT_USER_AGENT}".
I2P_HTTP_PROXY
HTTP proxy for connecting to the I2P network. This is required in
order to connect to instances on I2P. If -I is provided, the value in
this variable is ignored.
HELP_TEXT
}
# main [-I INPUT_JSON] [-T] [-e | -f] [-i INPUT_CSV] [-o OUTPUT_JSON]
# main -h
#
# Main function. Parses the options, reads the CSV of instances, builds a
# JSON entry for each instance with create_instance_entry, and writes the
# final JSON object (with today's date and the list of instances) to the
# output file. See helpdoc for what each option means.
#
# Exit/return status:
#   0: success (or, with -e, success apart from unreachable instances)
#   1: missing dependency; -I and -i name the same file; the -I file could
#      not be processed by jq; the input file does not exist or is a
#      directory; an instance failed with -f in effect; or at least one
#      instance could not be reached and -e was not given
#   2: a CSV row could not be parsed
# 255: usage error (the function calls exit)
# Otherwise the status of the final jq pipeline or of the failed input read.
main ()
{
    local opt=
    local OPTIND
    local OPTARG
    # Command output is split on newlines only. Declaring IFS local keeps
    # that setting from leaking into the caller when this file is sourced.
    local IFS=$'\n'
    local nofailrc=n
    local failfast=n
    local do_tor=y
    local do_i2p=y
    local -a get_opts=()
    local -a missing_deps=()
    local import_nonwww_from_file=
    local imported_json=
    local input_file=/dev/stdin
    local output_file=/dev/stdout
    local -a instance_entries=()
    local -a imported_nonwww=()
    local instance_entry=
    local -i rc=0
    local json_corrupted=n
    local dep=
    local row=
    local csv_out=
    local field=
    local failed_url=

    # The leading ':' puts getopts in silent mode: a missing option argument
    # is reported as ':' and an unknown option as '?', both with the option
    # letter in OPTARG.
    while getopts ":I:Tefhi:o:" opt
    do
        case "${opt}" in
            I)
                import_nonwww_from_file="${OPTARG}"
                # The help text documents `-I -` as "read from stdin".
                if [[ "${import_nonwww_from_file}" == '-' ]]
                then
                    import_nonwww_from_file=/dev/stdin
                fi
                ;;
            T) do_tor=n ;;
            e) nofailrc=y ;;
            f) failfast=y ;;
            h) helpdoc ; exit ;;
            i)
                input_file="${OPTARG}"
                if [[ -z "${input_file}" ]]
                then
                    echo >&2 "-i: Please specify a file."
                    exit 255
                fi
                if [[ "${input_file}" == '-' ]]
                then
                    input_file=/dev/stdin
                fi
                ;;
            o)
                output_file="${OPTARG}"
                if [[ -z "${output_file}" ]]
                then
                    echo >&2 "-o: Please specify a file."
                    exit 255
                fi
                if [[ "${output_file}" == '-' ]]
                then
                    output_file=/dev/stdout
                fi
                ;;
            :)
                echo >&2 "-${OPTARG}: option requires an argument"
                helpdoc
                exit 255
                ;;
            \?)
                echo >&2 "-${OPTARG}: invalid option"
                helpdoc
                exit 255
                ;;
        esac
    done

    # -e and -f cannot be used together.
    if [[ "${nofailrc}" == "y" && "${failfast}" == "y" ]]
    then
        echo >&2 "-e and -f cannot be used together."
        helpdoc
        exit 255
    fi

    # Make sure we have necessary dependencies before moving forward.
    # check_dependencies prints one missing program per line.
    mapfile -t missing_deps < <(check_dependencies)
    if [[ ${#missing_deps[@]} -ne 0 ]]
    then
        {
            echo "Dependencies are missing. Please install them and then try running the script again."
            echo
            echo "Missing dependencies:"
            for dep in "${missing_deps[@]}"
            do
                echo -e "\t${dep}"
            done
        } >&2
        return 1
    fi

    # Special handling for -I.
    if [[ -n "${import_nonwww_from_file}" ]]
    then
        # Abort if -I and -i point to the same file.
        if [[ "${import_nonwww_from_file}" == "${input_file}" ]]
        then
            echo >&2 "-I and -i cannot point to the same file."
            echo >&2 "For more information, run: ${BASH_SOURCE[0]} -h"
            return 1
        fi

        # Set do_tor <- n so that we don't attempt to make tor connections.
        do_tor=n

        # Do the same for i2p.
        do_i2p=n

        # Attempt to read in onion and I2P instances, one compact JSON object
        # per line. The output is captured first so that jq's exit status can
        # be checked, and only then split into the array (without being
        # subject to pathname expansion).
        imported_json="$(jq -Mcer '.instances[] | select(.onion or .i2p)' "${import_nonwww_from_file}")"
        rc=$?
        if [[ ${rc} -ne 0 ]]
        then
            echo >&2 "Failed to read onion instances from existing JSON file."
            return 1
        fi
        mapfile -t imported_nonwww <<<"${imported_json}"
    fi

    # Check to see if we have tor. If we don't, onion sites are skipped.
    if [[ "${do_tor}" == "n" ]] || ! can_tor
    then
        if [[ "${do_tor}" == "y" ]]
        then
            echo >&2 "WARNING: The tor service is not running. Onion sites will not be processed."
        fi
        do_tor="n"
        get_opts+=("-T")
    fi

    # Don't attempt I2P connections if no proxy was given.
    if ! can_i2p
    then
        do_i2p="n"
    fi

    if [[ "${do_i2p}" == "n" ]]
    then
        get_opts+=("-I")
    fi

    if [[ "${input_file}" != "/dev/stdin" ]]
    then
        if [[ ! -e "${input_file}" ]]
        then
            echo >&2 "${input_file}: No such file or directory"
            return 1
        fi

        if [[ -d "${input_file}" ]]
        then
            echo >&2 "${input_file}: Is a directory"
            return 1
        fi
    fi

    # Read in the CSV. Each element of rows keeps its trailing newline.
    if [[ "${input_file}" == "/dev/stdin" ]]
    then
        echo >&2 "Reading from stdin..."
    fi

    local -a rows=()
    mapfile rows <"${input_file}"
    rc=$?
    if [[ ${rc} -ne 0 ]]
    then
        echo >&2 "${input_file}: failed to read input"
        return ${rc}
    fi

    # Process the CSV, row by row.
    local -a values=()
    local -a failed=()
    local -i l=1
    local url=

    for row in "${rows[@]}"
    do
        csv_out="$(read_csv_row "${row}")"
        rc=$?

        # Collect the non-empty output lines as the row's values. Reading
        # line by line (rather than word-splitting an unquoted expansion)
        # keeps a value such as "*" from being expanded as a glob.
        values=()
        while IFS= read -r field
        do
            if [[ -n "${field}" ]]
            then
                values+=("${field}")
            fi
        done <<<"${csv_out}"

        if [[ ${rc} -ne 0 || ${#values[@]} -lt 3 || ${#values[@]} -gt 4 ]]
        then
            echo >&2 "${l}: failed to parse row"
            echo >&2 "Script will now terminate."
            return 2
        fi

        # Print friendly message to log while processing row.
        url="${values[0]}"
        echo -n >&2 "${url}: "

        instance_entry="$(IFS=$'\n' create_instance_entry "${get_opts[@]}" "${values[@]}")"
        rc=$?

        if [[ ${rc} -eq 0 ]]
        then
            instance_entries+=("${instance_entry}")
            echo "OK"
        elif [[ ${rc} -eq 100 ]]
        then
            # rc=100 means the onion/I2P site was skipped because we told
            # create_instance_entry to skip it (or it cannot be dialed).
            echo "SKIPPED"
        else
            echo "FAILED"

            if [[ "${failfast}" == "y" ]]
            then
                return 1
            fi

            failed+=("${url}")
        fi >&2

        (( l++ ))
        rc=0
    done

    # Assemble everything into JSON.
    # TODO: see if this can be done in one jq call, without having
    # to pass the list to jq --slurp and then everything to jq.
    printf '{"updated":"%s","instances":%s}' "${TODAY}" "$(IFS=$'\n'
        for instance in "${instance_entries[@]}" "${imported_nonwww[@]}"
        do
            echo "${instance}"
        done | jq -Mcers .
    )" | jq -Mer . >"${output_file}"
    rc=$?

    if [[ ${rc} -ne 0 ]]
    then
        echo >&2 "There was a problem processing the JSON. The output file may be corrupted."
        json_corrupted=y
    fi

    if [[ ${#failed[@]} -gt 0 ]]
    then
        {
            echo "The following instances could not be reached:"
            for failed_url in "${failed[@]}"
            do
                echo -e "\t${failed_url}"
            done
        } >&2

        if [[ "${nofailrc}" == "y" ]]
        then
            # Special case when user provides -e: exit with 0, except if the
            # JSON is corrupted.
            if [[ "${json_corrupted}" == "n" ]]
            then
                return 0
            fi
        else
            # Normal case: return non-zero code on this failure.
            return 1
        fi
    fi

    # This will be non-zero if the JSON is corrupted.
    return ${rc}
}
# Run main only when this file is executed directly. When it is sourced,
# nothing happens here and the functions are simply made available.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || {
    main "$@"
    exit
}