#!/usr/bin/env bash
# generate-instances-json.sh
#
# Generate a JSON of Libreddit instances, given a CSV input listing those
# instances.
#
# Information on script options is available by running
# generate-instances-json.sh -h
#
# For more information on how to use this script, see README.md.
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
# A pipeline fails when any of its stages fails, not only the last one.
set -o pipefail

# Current UTC date in ISO 8601 form (YYYY-MM-DD). main writes it into the
# "updated" field of the generated JSON.
TODAY="$(date -u -I)"

# External programs the script cannot run without:
#   curl -- performs the HTTP requests
#   jq   -- processes JSON
DEPENDENCIES=(
  curl
  jq
)

# Optional custom User-Agent taken from the environment. When it is non-empty,
# get hands it to curl through -A. It defaults to the empty string.
: "${USER_AGENT:=}"
# check_tor
#
# Succeeds (returns 0) when pidof finds a running process named "tor" and
# fails otherwise. pidof is run in quiet mode (-q), so nothing is printed.
check_tor() {
  pidof -q tor
}
# check_program PROGRAM
#
# Succeeds (returns 0) when `command -v` can resolve PROGRAM and fails
# otherwise. The resolved location is discarded, so nothing is written to
# stdout.
check_program() {
  local name="${1}"

  command -v "${name}" >/dev/null
}
# can_tor
#
# Returns true if tor is running and torsocks is installed.
can_tor ()
{
check_tor && check_program torsocks
}
# check_dependencies
#
# Checks every program listed in the global array DEPENDENCIES with
# check_program.
#
# Outputs: the name of each missing program, one per line, on stdout.
# Returns: 0 when every dependency was found, 1 when at least one is missing.
check_dependencies() {
  local -i rc=0
  # Keep the loop variable local so that a direct (non-subshell) call does not
  # overwrite a caller's variable of the same name.
  local dep=

  for dep in "${DEPENDENCIES[@]}"; do
    if ! check_program "${dep}"; then
      rc=1
      printf '%s\n' "${dep}"
    fi
  done

  return "${rc}"
}
# read_csv_row [-d DELIMITER] [-v] ROW
#
# Reads a row of comma-separated values and prints each value on its own line
# to stdout. The function prints nothing and returns 1 if the row is malformed
# (it ends inside a quoted value or right after a backslash), or if no ROW
# argument (or an empty one) was passed.
#
# The default delimiter is ','. Option -d changes it to a different single
# character.
#
# Option -v prints "$i: " before each value, where $i starts at 1 and is the
# value's position in the row.
#
# The whole argument is treated as one row; it may contain \n (for example
# inside a quoted value).
#
# Parsing rules:
#   - A backslash escapes the next character. The backslash and that character
#     are both copied verbatim into the value (so \" stays \"), because the
#     value ultimately goes into a JSON string.
#   - Double quotes toggle quoted mode and are not copied into the value. A
#     delimiter inside quotes is ordinary text.
#   - A trailing delimiter produces a final empty value.
read_csv_row() {
  local opt=
  local OPTIND
  local OPTARG
  local -i i=0
  local -i quote=0
  local -i esc=0
  local -i seen_delim=0
  local -i len=0
  local row=
  local print_col=n
  local char=
  local value=
  local -a values=()
  local delim=,

  while getopts "d:v" opt; do
    case "${opt}" in
      d) delim="${OPTARG}" ;;
      v) print_col="y" ;;
      *) ;;
    esac
  done
  shift "$((OPTIND - 1))"

  # Get row from arg.
  row="${1}"
  if [[ -z "${row}" ]]; then
    return 1
  fi

  # Process row character by character.
  len="${#row}"
  for ((i = 0; i < len; i++)); do
    char="${row:${i}:1}"

    # The previous character was a backslash: write the escape verbatim into
    # the value. This counts as content, so the value is no longer "just
    # after a delimiter"; without clearing seen_delim a final value made up
    # only of escapes would be dropped at the end of the row.
    if [[ ${esc} -eq 1 ]]; then
      esc=0
      seen_delim=0
      value+="\\${char}"
      continue
    fi

    # \ triggers escape.
    # shellcheck disable=SC1003
    if [[ "${char}" == '\' ]]; then
      esc=1
      continue
    fi

    # A delimiter outside of quotes ends the current value.
    if [[ ${quote} -eq 0 && "${char}" == "${delim}" ]]; then
      values+=("${value}")
      value=
      seen_delim=1
      continue
    fi

    # " toggles quoted mode; the quote itself is not part of the value.
    if [[ "${char}" == '"' ]]; then
      quote=$(((quote + 1) % 2))
      continue
    fi

    # Ordinary character.
    seen_delim=0
    value+="${char}"
  done

  # Handle unexpected end of row.
  if [[ ${quote} -eq 1 || ${esc} -eq 1 ]]; then
    return 1
  fi

  # Add the final value: either pending content, or the empty value that
  # follows a trailing delimiter.
  if [[ (${seen_delim} -eq 0 && -n "${value}") || (${seen_delim} -eq 1 && -z "${value}") ]]; then
    values+=("${value}")
  fi

  # Print each value on a separate line. printf is used instead of echo so
  # that values such as "-n" are printed literally.
  i=1
  for value in "${values[@]}"; do
    if [[ "${print_col}" == "y" ]]; then
      printf '%d: ' "${i}"
      ((i++))
    fi
    printf '%s\n' "${value}"
  done

  return 0
}
# canonicalize_url URL
#
# Prints a canonical form of URL to stdout:
# -- The whole string is converted to lowercase.
# -- Trailing slashes are removed, but only when the path is / (that is,
#    nothing but slashes follows the first / after the host).
#
# Returns 1 if no URL or a blank URL is given, or 2 if the lowercased string
# does not start with something that looks like scheme://host.
#
# TODO: Internationalized domain name support. For now, provide the URL in
# Punycode if needed.
canonicalize_url() {
  local url="${1}"
  local rest=

  [[ -n "${url}" ]] || return 1

  # Lowercase everything before validating.
  url="${url,,}"

  # Only the start of the string is validated; anything may follow the host.
  if ! [[ "${url}" =~ ^[a-z0-9]+://[a-z0-9\.\-]+/? ]]; then
    return 2
  fi

  # Whatever follows the first / after the host. If the URL has no such
  # slash the pattern does not match and rest is the whole URL.
  rest="${url#*://*/}"

  # Strip trailing slashes only when the rest is empty or consists of slashes.
  if [[ -z "${rest//\//}" ]]; then
    while [[ "${url}" == */ ]]; do
      url="${url%/}"
    done
  fi

  printf '%s\n' "${url}"
}
# get [-T] URL
#
# Makes an HTTP(S) GET request to the provided URL with curl. The response is
# written to stdout. The URL is first passed through canonicalize_url. If the
# host's top-level label is "onion", the curl call is wrapped with torsocks
# and the curl max-time is raised from 30 to 60 seconds. Onion sites may use
# http or https; every other site must use https.
#
# The request is attempted up to 3 times. A custom User-Agent is passed to
# curl with -A when the global USER_AGENT is set and non-empty.
#
# The return value is 0 on success, the exit status of the last curl attempt
# on failure, or:
# 100: no or blank URL provided
# 101: invalid URL (canonicalize_url printed nothing, or the scheme is
#      neither http nor https)
# 102: URL is an onion site, but we can't connect to tor
# 103: non-tor URL has non-https scheme
# 104: prevented from dialing onion site
#
# Option -T will cause get to skip an onion site, silently, and 104 will be
# returned.
#
# TODO: support i2p
get() {
  local opt=
  local OPTIND
  local OPTARG
  local no_tor=n
  local url=
  local scheme=
  local host=
  local zone=
  local -i rc=0
  local -i tries=3
  local -i timeout=30
  local -i attempt=0
  local -a curl_cmd=(curl)

  while getopts "T" opt; do
    case "${opt}" in
      T) no_tor=y ;;
      *) ;;
    esac
  done
  shift "$((OPTIND - 1))"

  if [[ -z "${1}" ]]; then
    return 100
  fi

  # Get the canonical URL.
  url="$(canonicalize_url "${1}")"
  if [[ -z "${url}" ]]; then
    return 101
  fi

  # Extract the scheme. We only support HTTP or HTTPS.
  scheme="${url%%://*}"
  case "${scheme}" in
    http | https) ;;
    *) return 101 ;;
  esac

  # Extract the zone (last label of the host name). Only the host is
  # examined: the path, query, fragment, userinfo and port are cut away
  # first, so that e.g. "/page.html" or ":8080" cannot be mistaken for (or
  # hide) the zone.
  host="${url#*://}"
  host="${host%%[/?#]*}"
  host="${host##*@}"
  host="${host%%:*}"
  zone="${host##*.}"

  if [[ "${zone}" == "onion" ]]; then
    if [[ "${no_tor}" == "y" ]]; then
      return 104
    fi
    # Don't bother if tor isn't running or we don't have torsocks.
    if ! can_tor; then
      return 102
    fi
    timeout=60
    curl_cmd=(torsocks curl)
  elif [[ "${scheme}" != "https" ]]; then
    # HTTPS is enforced on clearnet sites.
    return 103
  fi

  # Use a custom User-Agent if provided. An unset USER_AGENT is treated the
  # same as an empty one.
  if [[ -n "${USER_AGENT:-}" ]]; then
    curl_cmd+=(-A "${USER_AGENT}")
  fi

  # Do the GET, retrying until it succeeds or the attempts are used up.
  for ((attempt = 0; attempt < tries; attempt++)); do
    "${curl_cmd[@]}" -m"${timeout}" -fs -- "${url}"
    rc=$?
    if [[ ${rc} -eq 0 ]]; then
      return 0
    fi
  done

  return "${rc}"
}
# create_instance_entry [-T] URL COUNTRY_CODE [CLOUDFLARE [DESCRIPTION]]
#
# Fetches URL with get, scrapes the instance version out of the response and
# prints a one-line JSON object describing the instance to stdout.
#
# To specify that the instance is behind Cloudflare, set the third argument
# to exactly "true"; any other value is interpreted as false.
#
# A description can be given as the fourth argument (so to describe a site
# for which Cloudflare is _disabled_, pass "" as the third argument). The
# description, like URL and COUNTRY_CODE, is inserted verbatim into a JSON
# string, so quotes in it must already be escaped. (The idea is that
# read_csv_row does this escaping and its values are passed in unchanged.)
#
# The URL is stored under the key "onion" when its host ends in .onion, and
# under "url" otherwise.
#
# Option -T is forwarded to get, causing onion sites to be skipped silently.
#
# Returns:
#   0   success
#   1   URL or COUNTRY_CODE missing
#   2   get failed
#   3   get succeeded but the response was empty
#   4   no version string found in the response
#   100 onion site skipped because of -T
create_instance_entry() {
  local cloudflare=n
  local res=
  local version=
  local json=
  local url_type="url"
  local -i rc=0
  local -a get_opts=()
  local opt=
  local OPTIND
  local OPTARG
  # The host must END in .onion (optionally followed by a port). Hosts that
  # merely contain ".onion", such as abc.onion.ly, are clearnet sites.
  local onion_re='^https?://[^/?#]+\.onion(:[0-9]+)?([/?#]|$)'

  while getopts "T" opt; do
    case "${opt}" in
      T) get_opts+=("-T") ;;
      *) ;;
    esac
  done
  shift "$((OPTIND - 1))"

  local url="${1}"
  local country="${2}"
  local description="${4}"

  if [[ -z "${url}" || -z "${country}" ]]; then
    return 1
  fi

  if [[ "${3}" == "true" ]]; then
    cloudflare=y
  fi

  res="$(get "${get_opts[@]}" "${url}")"
  rc=$?
  if [[ ${rc} -ne 0 ]]; then
    # 104 is returned if we prevented get from connecting to an onion site.
    # That requires us to return the special code 100.
    if [[ ${rc} -eq 104 ]]; then
      return 100
    fi
    return 2
  fi

  if [[ -z "${res}" ]]; then
    return 3
  fi

  # There's no good way to get the version apart from a scrape. This assumes
  # the version is always a semantic version (vX.Y.Z), which may not be true
  # for every release.
  #
  # sed prints one result per matching line. Only the last one is kept so the
  # value is always a single line and cannot break the JSON string below.
  # NOTE(review): the last match is presumed to be the page footer rather
  # than page content -- confirm against real instance output.
  version="$(sed -nE 's/.*(v([0-9]+\.){2}[0-9]+).*$/\1/p' <<<"${res}" | tail -n 1)"
  if [[ -z "${version}" ]]; then
    return 4
  fi

  # Find out if this is an onion website.
  if [[ "${url,,}" =~ ${onion_re} ]]; then
    url_type="onion"
  fi

  # Build JSON.
  json="{\"${url_type}\":\"${url}\""
  json+=",\"country\":\"${country}\""
  json+=",\"version\":\"${version}\""
  if [[ "${cloudflare}" == "y" ]]; then
    json+=",\"cloudflare\":true"
  fi
  if [[ -n "${description}" ]]; then
    # DANGER: If the description string isn't properly escaped, the JSON will
    # be malformed!
    json+=",\"description\":\"${description}\""
  fi
  json+="}"

  printf '%s\n' "${json}"
}
# NOTES
#
# use jq --slurp to turn multiple objects into an array
#
# load any existing onion sites from json:
# jq -Mcer '.instances[] | select(.onion)' instances-example.json
# helpdoc
#
# Print usage information to stdout.
helpdoc ()
{
cat <&2 "-i: Please specify a file."
fi
if [[ "${input_file}" == '-' ]]
then
input_file=/dev/stdin
fi
;;
o)
output_file="${OPTARG}"
if [[ -z "${output_file}" ]]
then
echo >&2 "-o: Please specify a file."
fi
if [[ "${output_file}" == '-' ]]
then
output_file=/dev/stdout
fi
;;
\?)
echo >&2 "-${OPTARG}: invalid option"
helpdoc
exit 255
;;
esac
done
# Make sure we have necessary dependencies before moving forward.
# shellcheck disable=SC2207
IFS=$'\n' missing_deps=($(check_dependencies))
if [[ ${#missing_deps} -ne 0 ]]
then
{
echo "Dependencies are missing. Please install them and then try running the script again."
echo
echo "Missing dependencies:"
for dep in "${missing_deps[@]}"
do
echo -e "\t${dep}"
done
} >&2
return 1
fi
# Special handling for -I.
if [[ -n "${import_onions_from_file}" ]]
then
# Abort if -I and -i point to the same file.
if [[ "${import_onions_from_file}" == "${input_file}" ]]
then
echo >&2 "-I and -i cannot point to the same file."
echo >&2 "For more information, run: ${BASH_SOURCE[0]} -h"
return 1
fi
# Set do_tor <- n so that we don't attempt to make tor connections.
do_tor=n
# Attempt to read in onion instances.
# shellcheck disable=SC2207
# (mapfile not ideal here since a pipe is required, inducing a
# subshell, meaning nothing will actually get added to
# imported_onions)
IFS=$'\n' imported_onions=($(jq -Mcer '.instances[] | select(.onion)' "${import_onions_from_file}"))
rc=$?
if [[ ${rc} -ne 0 ]]
then
echo >&2 "Failed to read onion instances from existing JSON file."
return 1
fi
fi
# Check to see if we have tor. If we don't, then we will have to import
# the existing tor instances from the JSON.
# TODO: For I2P, we will likely have to do something similar.
if [[ "${do_tor}" == "n" ]] || ! can_tor
then
if [[ "${do_tor}" == "y" ]]
then
echo >&2 "WARNING: Either the tor service is not running or torsocks is not available. Either way, onion sites will not be processed."
fi
do_tor="n"
get_opts+=("-T")
fi
if [[ "${input_file}" != "/dev/stdin" ]]
then
if [[ ! -e "${input_file}" ]]
then
echo >&2 "${input_file}: No such file or directory"
return 1
fi
if [[ -d "${input_file}" ]]
then
echo >&2 "${input_file}: Is a directory"
return 1
fi
fi
# Read in the CSV.
local -a rows=()
<"${input_file}" mapfile rows
rc=0
if [[ ${rc} -ne 0 ]]
then
return ${rc}
fi
# Process the CSV, row by row.
local -a values=()
local -a failed=()
local l=1
local url=
for row in "${rows[@]}"
do
# shellcheck disable=SC2207
IFS=$'\n' values=($(read_csv_row "${row}"))
rc=$?
if [[ ${rc} -ne 0 || ${#values[@]} -lt 3 || ${#values[@]} -gt 4 ]]
then
echo >&2 "${l}: failed to parse row"
echo >&2 "Script will now terminate."
return 2
fi
# Print friendly message to log while processing row.
url="${values[0]}"
echo -n >&2 "${url}: "
instance_entry="$(IFS=$'\n' create_instance_entry "${get_opts[@]}" "${values[@]}")"
rc=$?
if [[ ${rc} -eq 0 ]]
then
IFS=$'\n' instance_entries+=("${instance_entry}")
echo "OK"
elif [[ ${rc} -eq 100 ]]
then
# rc=100 means the onion site is skipped because we told
# create_instance_entry to skip the onion site.
echo "SKIPPED"
else
echo "FAILED"
if [[ "${failfast}" == "y" ]]
then
return 1
fi
failed+=("${url}")
fi >&2
(( l++ ))
rc=0
done
# Assemble everything into JSON.
# TODO: see if this can be done in one jq call, without having
# to pass the list to jq --slurp and then everything to jq.
printf '{"updated":"%s","instances":%s}' "${TODAY}" "$(IFS=$'\n'
for instance in "${instance_entries[@]}" "${imported_onions[@]}"
do
echo "${instance}"
done | jq -Mcers .
)" | jq -Mer . >"${output_file}"
rc=$?
if [[ ${rc} -ne 0 ]]
then
echo >&2 "There was a problem processing the JSON. The output file may be corrupted."
fi
if [[ ${#failed[@]} -gt 0 ]]
then
{
echo "The following instances could not be reached:"
for failed_url in "${failed[@]}"
do
echo -e "\t${failed_url}"
done
} >&2
return 1
fi
return ${rc}
}
# Run main only when this file is executed directly. When the file is sourced,
# the functions above are defined but nothing is run. The bare exit passes
# main's return status on as the script's exit status.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
  exit
fi