mirror of
https://github.com/redlib-org/redlib-instances.git
synced 2024-11-13 18:06:15 -05:00
893 lines
23 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# generate-instances-json.sh
#
# Generate a JSON of Libreddit instances, given a CSV input listing those
# instances.
#
# Information on script options is available by running
# generate-instances.sh -h
#
# For more information on how to use this script, see README.md.
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.

# Fail a pipeline if any stage of it fails. (Deliberately NOT set -e: the
# script inspects $? after fallible commands throughout.)
set -o pipefail

# Grab today's date (UTC, ISO-8601); stamped into the "updated" field of the
# generated JSON. NOTE(review): date -I is a GNU coreutils extension.
TODAY="$(date -I -u)"

# List of programs on which this script depends.
# curl is required in order to make HTTP requests.
# jq is required for JSON processing.
DEPENDENCIES=(curl jq)

# This is the default User-Agent the script will tell curl to use if the
# environment variable USER_AGENT is not defined.
DEFAULT_USER_AGENT="libreddit-instance-updater/0.1"

# If USER_AGENT is specified in the envs, we'll pass this argument to curl
# using the -A flag to set a custom User-Agent.
USER_AGENT="${USER_AGENT:-${DEFAULT_USER_AGENT}}"

# HTTP proxy for connecting to nodes on I2P. This is an environment variable.
# Left empty (the default), I2P instances are skipped.
I2P_HTTP_PROXY="${I2P_HTTP_PROXY:-}"

# check_tor
#
# Returns true (0) if a tor process is currently running; false otherwise.
# Output is suppressed by pidof -q; only the exit status matters.
check_tor ()
{
    local daemon=tor
    pidof -q "${daemon}"
}

# check_program PROGRAM
#
# Returns true if the specified program is in PATH; false otherwise.
# (Uses the portable `command -v` builtin rather than `which`.)
check_program ()
{
    local prog="${1}"

    command -v "${prog}" >/dev/null
}

# can_tor
#
# Returns true if onion sites can be dialed, i.e. tor is running.
# Currently a thin delegate to check_tor, kept as a separate predicate (in
# symmetry with can_i2p) so the policy can grow without touching check_tor.
can_tor ()
{
    check_tor
}

# can_i2p
#
# Returns true if an I2P HTTP proxy is specified (i.e. the global
# I2P_HTTP_PROXY is non-empty), meaning I2P sites can be dialed.
can_i2p ()
{
    test -n "${I2P_HTTP_PROXY}"
}

# check_dependencies
#
# Checks that every program named in the global DEPENDENCIES array is
# available in PATH.
#
# Globals:   DEPENDENCIES (read)
# Outputs:   each missing dependency, one per line, to stdout
# Returns:   0 if all dependencies are present; 1 if any is missing
check_dependencies ()
{
    local -i rc=0
    # FIX: dep was previously not declared local and leaked into the
    # caller's (global) scope.
    local dep=

    for dep in "${DEPENDENCIES[@]}"
    do
        if ! check_program "${dep}"
        then
            rc=1
            echo "${dep}"
        fi
    done

    return "${rc}"
}

# read_csv_row [-d DELIMITER] [-v] ROW
#
# Reads a row of comma-separated values. Each value is printed as a separate
# line to stdout. The function prints nothing and returns 1 if the row is
# malformed (unterminated quote or trailing escape), or if no ROW argument
# was passed to the function.
#
# The default delimiter is ','. Option -d can change this delimiter to a
# different character.
#
# Option -v will print "$i: " before each value, where $i starts at 1 and
# represents the value's position in the row.
#
# It is assumed that the total input is a row, which may include \n (if it's
# in, say, a quoted value).
read_csv_row ()
{
    local opt=
    local OPTIND
    local OPTARG

    local -i i=0
    local -i quote=0        # 1 while inside a double-quoted value
    local -i esc=0          # 1 when the previous character was a backslash
    local -i seen_delim=0   # 1 when the last character consumed was a delimiter
    local row=
    local print_col=n
    local len=
    local char=
    local value=
    local -a values=()
    local delim=,

    while getopts "d:v" opt
    do
        case "${opt}" in
            d) delim="${OPTARG}" ;;
            v) print_col="y" ;;
            *) ;;
        esac
    done
    shift "$((OPTIND-1))"

    # Get row from arg; reject a missing/blank row.
    row="${1}"
    if [[ -z "${1}" ]]
    then
        return 1
    fi

    # Process row character by character.
    len="${#row}"

    value=
    for (( i = 0; i < len; i++ ))
    do
        char="${row:${i}:1}"

        # "Handle" escapes. Really, it just means writing the escape verbatim
        # into the string. Yes, that includes ". Because this is ultimately
        # going into JSON, and making this a fully-featured CSV reader would
        # be beyond the scope of for what this script is intended.
        if [[ ${esc} -eq 1 ]]
        then
            esc=0
            value+="\\${char}"

            # Escape handled. Move on to next character.
            continue
        fi

        # \ triggers escape.
        # shellcheck disable=SC1003
        if [[ "${char}" == '\' ]]
        then
            esc=1
            continue
        fi

        # A delimiter means the end of the value (assuming we're not in a
        # quote).
        if [[ ${quote} -eq 0 && "${char}" == "${delim}" ]]
        then
            IFS=$'\n' values+=("${value}")
            value=
            seen_delim=1
            continue
        fi

        # " means the value is quoted, assuming we're not in the middle of an
        # escape. quote toggles between 0 and 1 on each unescaped quote.
        if [[ ${esc} -eq 0 && "${char}" == '"' ]]
        then
            quote=$(( (quote + 1) % 2 ))

            # We don't actually want to include the double quote in the value.
            continue
        fi

        # This character isn't a delimiter, so switch off seen_delim.
        seen_delim=0

        value+="${char}"
    done

    # Handle unexpected end of row (inside a quote, or dangling backslash).
    if [[ ${quote} -eq 1 || ${esc} -eq 1 ]]
    then
        return 1
    fi

    # Add the final value to the list of values: either a non-empty trailing
    # value, or an empty value implied by a trailing delimiter.
    if [[ (${seen_delim} -eq 0 && -n "${value}") || (${seen_delim} -eq 1 && -z "${value}") ]]
    then
        values+=("${value}")
    fi

    # Print each value in a separate line.
    i=1
    for value in "${values[@]}"
    do
        if [[ "${print_col}" == "y" ]]
        then
            echo -n "${i}: "
            (( i++ ))
        fi
        echo "${value}"
    done
}

# canonicalize_url URL
#
# Performs the following transformations of the given URL:
# -- Converts the string to all-lowercase.
# -- Removes any trailing slashes, but only if the path is /.
#
# Returns 1 if no or a blank URL is provided, or 2 if the string is not a
# valid url. On success, the canonical URL is printed to stdout.
#
# TODO: Internationalized domain name support. For now, provide the URL in
# Punycode if needed.
canonicalize_url ()
{
    local canonical=

    [[ -n "${1}" ]] || return 1

    # Lowercase the whole string.
    canonical="${1,,}"

    # Reject anything that doesn't look like scheme://host[/...].
    [[ "${canonical}" =~ ^[a-z0-9]+://[a-z0-9\.\-]+/? ]] || return 2

    # Strip trailing slashes, but only when the path is / (i.e. everything
    # after the host is nothing but slashes).
    if [[ "${canonical#*://*/}" =~ ^/*$ ]]
    then
        while [[ "${canonical}" == */ ]]
        do
            canonical="${canonical%/}"
        done
    fi

    echo "${canonical}"
}

# get [-I] [-T] URL
#
# Makes an HTTP(S) GET request to the provided URL with curl. The response is
# written to standard out. get will determine if the URL is an onion site,
# and, if so, it will wrap the curl call with a socks proxy. If the URL is an
# I2P site, and I2P_HTTP_PROXY is non-empty, tell curl to use that as the
# proxy.
#
# The return value is the curl return value, or:
# 100: no or blank URL provided
# 101: invalid URL
# 102: URL is an onion site, but we can't connect to tor
# 103: non-tor URL has non-https scheme
# 104: prevented from dialing onion site
# 105: no I2P proxy provided
# 106: prevented from dialing I2P site
#
# Option -T will cause get to skip an onion site, silently, and 104 will be
# returned. Option -I does the same for an I2P site, returning 106.
get ()
{
    local opt=
    local OPTIND
    local OPTARG

    local no_tor=n
    local no_i2p=n
    local url=
    local url_no_scheme=
    local scheme=
    local zone=
    local -i rc=0
    local -i tries=3          # number of GET attempts before giving up
    local -i timeout=30       # curl -m (max-time); raised to 60 for onion/I2P
    local -a curl_cmd=(curl)  # built as an array so quoting stays safe

    while getopts "IT" opt
    do
        case "${opt}" in
            I) no_i2p=y ;;
            T) no_tor=y ;;
            *) ;;
        esac
    done
    shift $((OPTIND-1))

    if [[ -z "${1}" ]]
    then
        return 100
    fi
    url="${1}"

    # Get the canonical URL; canonicalize_url prints nothing on failure.
    url="$(canonicalize_url "${url}")"
    if [[ -z "${url}" ]]
    then
        return 101
    fi
    url_no_scheme="${url#*://}"

    # Extract the scheme. We only support HTTP or HTTPS. But maybe Libreddit
    # has a future on gopher...
    # NOTE(review): scheme is already declared local above; this second
    # declaration is redundant but harmless.
    local scheme="${url%%://*}"
    case "${scheme}" in
        http|https) ;;
        *) return 101 ;;
    esac

    # Extract the zone (the last dot-separated host label, e.g. "onion",
    # "i2p", "com").
    zone="$(<<<"${url}" sed -nE 's|^.+://.+\.([^\./]+)/?.*|\1|p')"

    # Special handling for Onion and I2P sites.
    # - Onion/I2P sites can be either HTTPS or HTTP. But we want to enforce
    #   HTTPS on clearnet sites.
    # - Increase curl max-time to 60 seconds.
    if [[ "${zone,,}" == "onion" ]]
    then
        # Don't bother if tor isn't running. But if both are available,
        # make sure we wrap curl with socks.
        if [[ "${no_tor}" == "y" ]]
        then
            return 104
        fi

        if ! can_tor
        then
            return 102
        fi

        timeout=60
        curl_cmd=(curl --proxy socks5h://localhost:9050)
    elif [[ "${zone,,}" == "i2p" ]]
    then
        if [[ "${no_i2p}" == "y" ]]
        then
            return 106
        fi

        if ! can_i2p
        then
            return 105
        fi

        timeout=60
        curl_cmd=(curl -x "${I2P_HTTP_PROXY}")
    elif [[ "${scheme}" != "https" ]]
    then
        return 103
    fi

    # Use a custom User-Agent if provided.
    if [[ -n "${USER_AGENT?}" ]]
    then
        curl_cmd=("${curl_cmd[@]}" -A "${USER_AGENT}")
    fi

    # Do the GET. Try up to the number of times specified in the tries variable.
    # NOTE(review): the loop index i is not declared local and leaks into the
    # caller's scope.
    for (( i = tries; i > 0; i-- ))
    do
        "${curl_cmd[@]}" -m"${timeout}" -fs -- "${scheme}://${url_no_scheme}"
        rc=$?

        if [[ ${rc} -eq 0 ]]
        then
            return
        fi
    done

    return ${rc}
}

# create_instance_entry [-I] [-T] URL COUNTRY_CODE [CLOUDFLARE [DESCRIPTION]]
#
# Create JSON object for instance. To specify that the instance is behind
# Cloudflare, simply set the third argument to be true; any other value
# will be interpreted as false.
#
# A description can be specified in the fourth argument (which means that, if
# you want to specify description for a website for which Cloudflare is
# _disabled_, set the third argument to ""). If you pass description in,
# all quotes will need to be escaped, as this will go directly into a
# JSON string value. (The idea is that read_csv_row will do the appropriate
# processing of the rows, including escaping characters in the description
# column and we will then pass those values verbatim into this function.)
#
# Option -I/-T will cause get to skip an i2p/onion site, respectively, and
# 100 will be returned.
#
# Returns:
#     0 - success; the JSON object is printed to stdout
#     1 - missing URL or country argument
#     2 - the GET failed
#     3 - the GET returned an empty body
#     4 - no recognizable version string on the page
#   100 - the site was deliberately skipped (onion/I2P suppressed)
create_instance_entry ()
{
    local cloudflare=n
    local res=
    local version=
    local json=
    local url_type="url"   # JSON key for the URL: "url", "onion", or "i2p"
    local -i rc=0
    local -a get_opts=()   # flags forwarded verbatim to get

    local opt=
    local OPTIND
    local OPTARG

    while getopts "IT" opt
    do
        case "${opt}" in
            I) get_opts+=("-I") ;;
            T) get_opts+=("-T") ;;
            *) ;;
        esac
    done
    shift $((OPTIND-1))

    local url="${1}"
    local country="${2}"
    local description="${4}"

    if [[ -z "${url}" || -z "${country}" ]]
    then
        return 1
    fi

    if [[ "${3}" == "true" ]]
    then
        cloudflare=y
    fi

    res="$(get "${get_opts[@]}" "${url}")"
    rc=$?

    if [[ ${rc} -ne 0 ]]
    then
        # 104-6 are returned if we prevented get from connecting to an
        # onion/i2p site. This requires us to return the special code 100.
        if [[ ${rc} -eq 104 || ${rc} -eq 105 || ${rc} -eq 106 ]]
        then
            return 100
        fi

        return 2
    fi

    if [[ -z "${res}" ]]
    then
        return 3
    fi

    # Scrape the version (e.g. "v0.30.1") from the site.
    #
    # Future versions of Libreddit may advertise the version in a <meta> tag in
    # <head>, but it doesn't right now.
    version="$(<<<"${res}" sed -nE 's/.*\s+id="version">(v([0-9]+\.){2}[0-9]+).*$/\1/p')"
    if [[ -z "${version}" ]]
    then
        return 4
    fi

    # Find out if this is an onion/i2p website.
    # Yeah, this is a little lazy and we could do this a bit better.
    # NOTE(review): zone is not declared local and leaks into the caller.
    for zone in onion i2p
    do
        if [[ "${url,,}" =~ ^https?://[^/]+\.${zone}/?$ ]]
        then
            url_type="${zone}"
        fi
    done

    # Build JSON.
    # NOTE(review): url/country/version are interpolated without JSON
    # escaping; read_csv_row is trusted to have escaped anything dangerous.
    json="{"
    json+="$(printf '"%s":"%s"' "${url_type}" "${url}")"
    json+=","
    json+="$(printf '"country":"%s"' "${country}")"
    json+=","
    json+="$(printf '"version":"%s"' "${version}")"

    if [[ "${cloudflare}" == "y" ]]
    then
        json+=","
        json+="\"cloudflare\":true"
    fi

    if [[ -n "${description}" ]]
    then
        # DANGER: If the description string isn't properly escaped, the JSON
        # will be malformed!
        json+=","
        json+="$(printf '"description":"%s"' "${description}")"
    fi
    json+="}"

    echo "${json}"
}

# helpdoc
#
# Print usage information to stdout.
helpdoc ()
{
    # The heredoc delimiter is unquoted, so ${BASH_SOURCE[0]} and
    # ${DEFAULT_USER_AGENT} are expanded at print time.
    cat <<!
USAGE
    ${BASH_SOURCE[0]} [-I INPUT_JSON] [-T] [-e | -f] [-i INPUT_CSV] [-o OUTPUT_JSON]
    ${BASH_SOURCE[0]} -h

DESCRIPTION
    Generate a JSON of Libreddit instances, given a CSV file at INPUT_CSV
    listing those instances. If INPUT_CSV is not given, this script will
    read the CSV file from stdin.

    The INPUT_CSV file must be a file in CSV syntax of the form

        [url],[country code],[cloudflare enabled],[description]

    where all four parameters are required (though the description may be
    blank). Except for onion and I2P sites, all URLs MUST be HTTPS.

    OUTPUT_JSON will be overwritten if it exists. No confirmation will be
    requested from the user.

    By default:

    * This script will not attempt to connect to I2P instances. If you want
      this script to consider instances on the I2P network, you will need to
      provide an HTTP proxy in the environment variable I2P_HTTP_PROXY.
      This proxy typically listens at 127.0.0.1:4444.

    * This script will attempt to connect to instances in the CSV that are on
      Tor, provided that it can (it will check to see if Tor is running).
      If you want to disable connections to these onion sites, provide the
      -T option.

    * This script will return a non-zero status code when at least one instance
      could not be reached. If you want this script always to return 0 even
      when not all instances could be reached, provide the -e option (this
      script will still return a non-zero code if there was a problem
      constructing the final JSON object or if the file supplied to the -I
      option could not be read).

OPTIONS
    -I INPUT_JSON
        Import the list of Libreddit onion and I2P instances from the file
        INPUT_JSON. To use stdin, provide \`-I -\`. Implies -T, and further
        causes the script to ignore the value in I2P_HTTP_PROXY. Note that the
        argument provided to this option CANNOT be the same as the argument
        provided to -i. If the JSON could not be read, the script will exit with
        status code 1, even if -e is provided.

    -T
        Do not connect to Tor. Onion sites in INPUT_CSV will not be processed.
        Assuming no other failure, the script will still exit with status code
        0.

    -e
        Always exit with status code 0, even when at least one instance cannot
        be reached, except in the situations where (1) the file in INPUT_JSON
        (see \`-I\`) could not be processed; or (2) the JSON object could not
        be constructed. Cannot be used together with -f.

    -f
        Force the script to exit, with status code 1, upon the first failure to
        connect to an instance. Normally, the script will continue to build and
        output the JSON even when one or more of the instances could not be
        reached, though the exit code will be non-zero. Cannot be used together
        with -e.

    -i INPUT_CSV
        Use INPUT_CSV as the input file. To read from stdin (the default
        behavior), either omit this option or provide \`-i -\`. Note that the
        argument provided to this option CANNOT be the same as the argument
        provided to -I.

    -o OUTPUT_JSON
        Write the results to OUTPUT_JSON. Any existing file will be
        overwritten. To write to stdout (the default behavior), either omit
        this option or provide \`-o -\`.

ENVIRONMENT

    USER_AGENT
        Sets the User-Agent that curl will use when making the GET to each
        website. By default, this script will tell curl to set its User-Agent
        string to "${DEFAULT_USER_AGENT}".

    I2P_HTTP_PROXY
        HTTP proxy for connecting to the I2P network. This is required in
        order to connect to instances on I2P. If -I is provided, the value in
        this variable is ignored.
!
}

# main
#
# Main function: parses options, validates dependencies and input, builds one
# JSON entry per CSV row, and assembles the final JSON document.
#
# Returns 0 on success; non-zero on dependency, input, or JSON failure (see
# helpdoc for the exact -e/-f semantics).
main ()
{
    local opt=
    local OPTIND
    local OPTARG

    local nofailrc=n
    local failfast=n
    local do_tor=y
    local do_i2p=y
    local -a get_opts=()
    local -a missing_deps=()
    local import_nonwww_from_file=
    local input_file=/dev/stdin
    local output_file=/dev/stdout
    local -a instance_entries=()
    local -a imported_nonwww=()
    local instance_entry=
    local -i rc=0
    local json_corrupted=n
    # FIX: these were previously not declared local and leaked.
    local dep=
    local row=
    local failed_url=

    while getopts ":I:Tefhi:o:" opt
    do
        case "${opt}" in
            I) import_nonwww_from_file="${OPTARG}" ;;
            T) do_tor=n ;;
            e) nofailrc=y ;;
            f) failfast=y ;;
            h) helpdoc ; exit ;;
            i)
                input_file="${OPTARG}"
                if [[ -z "${input_file}" ]]
                then
                    # FIX: previously only warned and carried on with an
                    # empty filename; abort instead.
                    echo >&2 "-i: Please specify a file."
                    exit 255
                fi

                if [[ "${input_file}" == '-' ]]
                then
                    input_file=/dev/stdin
                fi
                ;;
            o)
                output_file="${OPTARG}"
                if [[ -z "${output_file}" ]]
                then
                    # FIX: same as -i above.
                    echo >&2 "-o: Please specify a file."
                    exit 255
                fi

                if [[ "${output_file}" == '-' ]]
                then
                    output_file=/dev/stdout
                fi
                ;;
            :)
                # FIX: the leading ':' in the optstring puts getopts in
                # silent mode, which reports a missing option argument as
                # ':'. This case previously fell through unhandled.
                echo >&2 "-${OPTARG}: option requires an argument"
                helpdoc
                exit 255
                ;;
            \?)
                echo >&2 "-${OPTARG}: invalid option"
                helpdoc
                exit 255
                ;;
        esac
    done

    # -e and -f cannot be used together.
    if [[ "${nofailrc}" == "y" && "${failfast}" == "y" ]]
    then
        # FIX: typo "canont" in user-facing message.
        echo >&2 "-e and -f cannot be used together."
        helpdoc
        exit 255
    fi

    # Make sure we have necessary dependencies before moving forward.
    # shellcheck disable=SC2207
    IFS=$'\n' missing_deps=($(check_dependencies))

    # FIX: ${#missing_deps} is the string length of the first element;
    # ${#missing_deps[@]} is the element count, which is what we want.
    if [[ ${#missing_deps[@]} -ne 0 ]]
    then
        {
            echo "Dependencies are missing. Please install them and then try running the script again."
            echo
            echo "Missing dependencies:"

            for dep in "${missing_deps[@]}"
            do
                echo -e "\t${dep}"
            done
        } >&2
        return 1
    fi

    # Special handling for -I.
    if [[ -n "${import_nonwww_from_file}" ]]
    then
        # Abort if -I and -i point to the same file.
        if [[ "${import_nonwww_from_file}" == "${input_file}" ]]
        then
            echo >&2 "-I and -i cannot point to the same file."
            echo >&2 "For more information, run: ${BASH_SOURCE[0]} -h"
            return 1
        fi

        # -I means onion/I2P entries are imported rather than dialed, so
        # suppress both tor and I2P connections.
        do_tor=n
        do_i2p=n

        # Attempt to read in onion/I2P instances.
        # shellcheck disable=SC2207
        # (a mapfile would not be ideal here since a pipe is required,
        # inducing a subshell, meaning nothing would actually get added to
        # imported_nonwww)
        IFS=$'\n' imported_nonwww=($(jq -Mcer '.instances[] | select(.onion or .i2p)' "${import_nonwww_from_file}"))
        rc=$?

        if [[ ${rc} -ne 0 ]]
        then
            echo >&2 "Failed to read onion instances from existing JSON file."
            return 1
        fi
    fi

    # Check to see if we have tor. If we don't, then we will have to import
    # the existing tor instances from the JSON.
    # TODO: For I2P, we will likely have to do something similar.
    if [[ "${do_tor}" == "n" ]] || ! can_tor
    then
        if [[ "${do_tor}" == "y" ]]
        then
            echo >&2 "WARNING: The tor service is not running. Onion sites will not be processed."
        fi
        do_tor="n"
        get_opts+=("-T")
    fi

    # Don't attempt I2P connections if no proxy was given.
    if ! can_i2p
    then
        do_i2p="n"
    fi

    if [[ "${do_i2p}" == "n" ]]
    then
        get_opts+=("-I")
    fi

    if [[ "${input_file}" != "/dev/stdin" ]]
    then
        if [[ ! -e "${input_file}" ]]
        then
            echo >&2 "${input_file}: No such file or directory"
            return 1
        fi

        if [[ -d "${input_file}" ]]
        then
            echo >&2 "${input_file}: Is a directory"
            return 1
        fi
    fi

    # Read in the CSV.
    if [[ "${input_file}" == "/dev/stdin" ]]
    then
        echo >&2 "Reading from stdin..."
    fi
    local -a rows=()
    <"${input_file}" mapfile rows
    # FIX: rc was previously hard-coded to 0 here, which made the error
    # check below dead code.
    rc=$?

    if [[ ${rc} -ne 0 ]]
    then
        return ${rc}
    fi

    # Process the CSV, row by row.
    local -a values=()
    local -a failed=()
    local l=1
    local url=
    for row in "${rows[@]}"
    do
        # shellcheck disable=SC2207
        IFS=$'\n' values=($(read_csv_row "${row}"))
        rc=$?

        if [[ ${rc} -ne 0 || ${#values[@]} -lt 3 || ${#values[@]} -gt 4 ]]
        then
            echo >&2 "${l}: failed to parse row"
            echo >&2 "Script will now terminate."
            return 2
        fi

        # Print friendly message to log while processing row.
        url="${values[0]}"
        echo -n >&2 "${url}: "

        instance_entry="$(IFS=$'\n' create_instance_entry "${get_opts[@]}" "${values[@]}")"
        rc=$?

        if [[ ${rc} -eq 0 ]]
        then
            IFS=$'\n' instance_entries+=("${instance_entry}")
            echo "OK"
        elif [[ ${rc} -eq 100 ]]
        then
            # rc=100 means the onion/I2P site was skipped because we told
            # create_instance_entry to skip it.
            echo "SKIPPED"
        else
            echo "FAILED"

            if [[ "${failfast}" == "y" ]]
            then
                return 1
            fi

            failed+=("${url}")
        fi >&2

        (( l++ ))
        rc=0
    done

    # Assemble everything into JSON.
    # TODO: see if this can be done in one jq call, without having
    # to pass the list to jq --slurp and then everything to jq.
    printf '{"updated":"%s","instances":%s}' "${TODAY}" "$(IFS=$'\n'
        for instance in "${instance_entries[@]}" "${imported_nonwww[@]}"
        do
            echo "${instance}"
        done | jq -Mcers .
    )" | jq -Mer . >"${output_file}"
    rc=$?

    if [[ ${rc} -ne 0 ]]
    then
        echo >&2 "There was a problem processing the JSON. The output file may be corrupted."
        json_corrupted=y
    fi

    if [[ ${#failed[@]} -gt 0 ]]
    then
        {
            echo "The following instances could not be reached:"
            for failed_url in "${failed[@]}"
            do
                echo -e "\t${failed_url}"
            done
        } >&2

        if [[ "${nofailrc}" == "y" ]]
        then
            # Special case when user provides -e: exit with 0, except if the
            # JSON is corrupted.
            if [[ "${json_corrupted}" == "n" ]]
            then
                return 0
            fi
        else
            # Normal case: return non-zero code on this failure.
            return 1
        fi
    fi

    # This will be non-zero if the JSON is corrupted.
    return ${rc}
}

# Run main only when this script is executed directly; when the file is
# sourced (e.g. for testing), the functions are defined but nothing runs.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]
then
    main "${@}"
    exit
fi