#!/usr/bin/env bash

# generate-instances-json.sh
#
# Generate a JSON of Libreddit instances, given a CSV input listing those
# instances.
#
# Information on script options is available by running
# generate-instances-json.sh -h
#
# For more information on how to use this script, see README.md.
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.

set -o pipefail

# Grab today's date.
TODAY="$(date -I -u)"

# List of programs on which this script depends.
# curl is required in order to make HTTP requests.
# jq is required for JSON processing.
DEPENDENCIES=(curl jq)

# If USER_AGENT is specified in the envs, we'll pass this argument to curl
# using the -A flag to set a custom User-Agent.
USER_AGENT="${USER_AGENT:-}"
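#
# Example (illustrative value and file name; any string curl accepts with -A works):
#   USER_AGENT="my-instance-checker/1.0" ./generate-instances-json.sh -i instances.csv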

# check_tor
#
# Returns true if tor is running; false otherwise.
check_tor ()
{
    pidof -q tor
}

# check_program
#
# Returns true if the specified program is in PATH; false otherwise.
check_program ()
{
    command -v "${1}" >/dev/null
}

# can_tor
#
# Returns true if tor is running and torsocks is installed.
can_tor ()
{
    check_tor && check_program torsocks
}

# check_dependencies
#
# Returns false if a script dependency is missing. If this is the case, each
# missing dependency will be printed to stdout.
check_dependencies ()
{
    local -i rc=0

    for dep in "${DEPENDENCIES[@]}"
    do
        if ! check_program "${dep}"
        then
            rc=1
            echo "${dep}"
        fi
    done

    return "${rc}"
}
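
# Illustrative example: if curl were not installed, check_dependencies would
# print "curl" on a line of its own and return 1; main() below slurps that
# output into its missing_deps array.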

# read_csv_row [-d DELIMITER] [-v] ROW
#
# Reads a row of comma-separated values. Each value is printed as a separate
# line to stdout. The function prints nothing and returns 1 if the row is
# malformed, or if no ROW argument was passed to the function.
#
# The default delimiter is ','. Option -d can change this delimiter to a
# different character.
#
# Option -v will print "$i: " before each value, where $i starts at 1 and
# represents the value's position in the row.
#
# It is assumed that the total input is a row, which may include \n (if it's
# in, say, a quoted value).
read_csv_row ()
{
    local opt=
    local OPTIND
    local OPTARG

    local -i i=0
    local -i quote=0
    local -i esc=0
    local -i seen_delim=0
    local row=
    local print_col=n
    local len=
    local char=
    local value=
    local -a values=()
    local delim=,

    while getopts "d:v" opt
    do
        case "${opt}" in
            d) delim="${OPTARG}" ;;
            v) print_col="y" ;;
            *) ;;
        esac
    done
    shift "$((OPTIND-1))"

    # Get row from arg.
    row="${1}"
    if [[ -z "${1}" ]]
    then
        return 1
    fi

    # Process row character by character.
    len="${#row}"

    value=
    for (( i = 0; i < len; i++ ))
    do
        char="${row:${i}:1}"

        # "Handle" escapes. Really, it just means writing the escape verbatim
        # into the string. Yes, that includes ". Because this is ultimately
        # going into JSON, and making this a fully-featured CSV reader would
        # be beyond the scope of what this script is intended for.
        if [[ ${esc} -eq 1 ]]
        then
            esc=0
            value+="\\${char}"

            # Escape handled. Move on to next character.
            continue
        fi

        # \ triggers escape.
        # shellcheck disable=SC1003
        if [[ "${char}" == '\' ]]
        then
            esc=1
            continue
        fi

        # A delimiter means the end of the value (assuming we're not in a
        # quote).
        if [[ ${quote} -eq 0 && "${char}" == "${delim}" ]]
        then
            IFS=$'\n' values+=("${value}")
            value=
            seen_delim=1
            continue
        fi

        # " means the value is quoted, assuming we're not in the middle of an
        # escape.
        if [[ ${esc} -eq 0 && "${char}" == '"' ]]
        then
            quote=$(( (quote + 1) % 2 ))

            # We don't actually want to include the double quote in the value.
            continue
        fi

        # This character isn't a delimiter, so switch off seen_delim.
        seen_delim=0

        value+="${char}"
    done

    # Handle unexpected end of row.
    if [[ ${quote} -eq 1 || ${esc} -eq 1 ]]
    then
        return 1
    fi

    # Add the final value to the list of values.
    if [[ (${seen_delim} -eq 0 && -n "${value}") || (${seen_delim} -eq 1 && -z "${value}") ]]
    then
        values+=("${value}")
    fi

    # Print each value in a separate line.
    i=1
    for value in "${values[@]}"
    do
        if [[ "${print_col}" == "y" ]]
        then
            echo -n "${i}: "
            (( i++ ))
        fi
        echo "${value}"
    done
}
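
# Illustrative example of the parser above:
#   read_csv_row -v 'example.com,US,true,"A, quoted description"'
# would print one value per line, with -v adding the position prefixes:
#   1: example.com
#   2: US
#   3: true
#   4: A, quoted description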

# canonicalize_url URL
#
# Performs the following transformations of the given URL:
# -- Converts the string to all-lowercase.
# -- Removes any trailing slashes, but only if the path is /.
#
# Returns 1 if no or a blank URL is provided, or 2 if the string is not a
# valid URL.
#
# TODO: Internationalized domain name support. For now, provide the URL in
# Punycode if needed.
canonicalize_url ()
{
    local url=

    if [[ -z "${1}" ]]
    then
        return 1
    fi
    url="${1}"

    # Convert URL to lowercase.
    url="${url,,}"

    # Reject the string if it's not a valid URL.
    if [[ ! "${url}" =~ ^[a-z0-9]+://[a-z0-9\.\-]+/? ]]
    then
        return 2
    fi

    # Strip trailing slashes, but only if the path is /.
    if [[ "${url#*://*/}" =~ ^/*$ ]]
    then
        while [[ "${url: -1:1}" == "/" ]]
        do
            url="${url:0: -1}"
        done
    fi

    echo "${url}"
}
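
# Illustrative examples (hypothetical URLs):
#   canonicalize_url "HTTPS://Example.COM/"      -> https://example.com
#   canonicalize_url "https://example.com/path/" -> https://example.com/path/
# The trailing slash is only stripped when the path is just "/".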

# get [-T] URL
#
# Makes an HTTP(S) GET request to the provided URL with curl. The response is
# written to standard out. get will determine if the URL is an onion site, and,
# if so, it wraps the curl call with torsocks. The return value is the curl
# return value, or:
# 100: no or blank URL provided
# 101: invalid URL
# 102: URL is an onion site, but we can't connect to tor
# 103: non-tor URL has non-https scheme
# 104: prevented from dialing onion site
#
# Option -T will cause get to skip an onion site, silently, and 104 will be
# returned.
get ()
{
    local opt=
    local OPTIND
    local OPTARG

    local no_tor=n
    local url=
    local url_no_scheme=
    local scheme=
    local zone=
    local -i rc=0
    local -i tries=3
    local -i timeout=30
    local -a curl_cmd=(curl)

    while getopts "T" opt
    do
        case "${opt}" in
            T) no_tor=y ;;
            *) ;;
        esac
    done
    shift $((OPTIND-1))

    if [[ -z "${1}" ]]
    then
        return 100
    fi
    url="${1}"

    # Get the canonical URL.
    url="$(canonicalize_url "${url}")"
    if [[ -z "${url}" ]]
    then
        return 101
    fi
    url_no_scheme="${url#*://}"

    # Extract the scheme. We only support HTTP or HTTPS. But maybe Libreddit
    # has a future on gopher...
    #
    # TODO: support i2p
    local scheme="${url%%://*}"
    case "${scheme}" in
        http|https) ;;
        *) return 101 ;;
    esac

    # Extract the zone.
    zone="$(<<<"${url}" sed -nE 's|^.+://.+\.([^\./]+)/?.*|\1|p')"

    # Special handling for onion sites.
    # - Don't bother if tor isn't running or we don't have torsocks. But if
    #   both are available, make sure we wrap curl with torsocks.
    # - Onion sites can be either HTTPS or HTTP. But we want to enforce
    #   HTTPS on clearnet sites.
    # - Increase curl max-time to 60 seconds.
    if [[ "${zone}" == "onion" ]]
    then
        if [[ "${no_tor}" == "y" ]]
        then
            return 104
        fi

        if ! can_tor
        then
            return 102
        fi

        timeout=60
        curl_cmd=(torsocks curl)
    elif [[ "${scheme}" != "https" ]]
    then
        return 103
    fi

    # Use a custom User-Agent if provided.
    if [[ -n "${USER_AGENT?}" ]]
    then
        curl_cmd=("${curl_cmd[@]}" -A "${USER_AGENT}")
    fi

    # Do the GET. Try up to the number of times specified in the tries variable.
    for (( i = tries; i > 0; i-- ))
    do
        "${curl_cmd[@]}" -m"${timeout}" -fsL -- "${scheme}://${url_no_scheme}"
        rc=$?

        if [[ ${rc} -eq 0 ]]
        then
            return
        fi
    done

    return ${rc}
}
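
# Illustrative examples of the return codes above (hypothetical URLs):
#   get "http://example.com"           -> 103 (clearnet URL is not HTTPS)
#   get -T "http://example1234.onion"  -> 104 (onion site skipped because of -T)
#   get "https://example.com"          -> curl's own exit status (0 on success)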

# create_instance_entry [-T] URL COUNTRY_CODE [CLOUDFLARE [DESCRIPTION]]
#
# Create JSON object for instance. To specify that the instance is behind
# Cloudflare, simply set the third argument to be true; any other value
# will be interpreted as false.
#
# A description can be specified in the fourth argument (which means that, if
# you want to specify a description for a website for which Cloudflare is
# _disabled_, set the third argument to ""). If you pass a description in,
# all quotes will need to be escaped, as this will go directly into a
# JSON string value. (The idea is that read_csv_row will do the appropriate
# processing of the rows, including escaping characters in the description
# column, and we will then pass those values verbatim into this function.)
#
# Option -T will cause get to skip an onion site, silently, and 100 will be
# returned.
create_instance_entry ()
{
    local cloudflare=n
    local res=
    local version=
    local json=
    local url_type="url"
    local -i rc=0
    local -a get_opts=()

    local opt=
    local OPTIND
    local OPTARG

    while getopts "T" opt
    do
        case "${opt}" in
            T) get_opts+=("-T") ;;
            *) ;;
        esac
    done
    shift $((OPTIND-1))

    local url="${1}"
    local country="${2}"
    local description="${4}"

    if [[ -z "${url}" || -z "${country}" ]]
    then
        return 1
    fi

    if [[ "${3}" == "true" ]]
    then
        cloudflare=y
    fi

    res="$(get "${get_opts[@]}" "${url}")"
    rc=$?

    if [[ ${rc} -ne 0 ]]
    then
        # 104 is returned if we prevented get from connecting to an onion site.
        # That requires us to return the special code 100.
        if [[ ${rc} -eq 104 ]]
        then
            return 100
        fi

        return 2
    fi

    if [[ -z "${res}" ]]
    then
        return 3
    fi

    # There's no good way to get the version apart from a scrape. This might
    # not work in early versions of Libreddit, or into the future.
    # TODO: previous capture group was ([^\<]+), but I changed this to
    # (v([0-9]+\.){2}[0-9]+) under the assumption the version is always a semantic
    # version; but this may not be true.
    version="$(<<<"${res}" sed -nE 's/.*<span\s+id="version">(v([0-9]+\.){2}[0-9]+).*$/\1/p')"
    if [[ -z "${version}" ]]
    then
        return 4
    fi

    # Find out if this is an onion website.
    # Yeah, this is a little lazy and we could do this a bit better.
    if [[ "${url,,}" =~ ^https?://[^/]+\.onion ]]
    then
        url_type="onion"
    fi

    # Build JSON.
    json="{"
    json+="$(printf '"%s":"%s"' "${url_type}" "${url}")"
    json+=","
    json+="$(printf '"country":"%s"' "${country}")"
    json+=","
    json+="$(printf '"version":"%s"' "${version}")"

    if [[ "${cloudflare}" == "y" ]]
    then
        json+=","
        json+="\"cloudflare\":true"
    fi

    if [[ -n "${description}" ]]
    then
        # DANGER: If the description string isn't properly escaped, the JSON will be
        # malformed!
        json+=","
        json+="$(printf '"description":"%s"' "${description}")"
    fi
    json+="}"

    echo "${json}"
}
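
# Illustrative example: for a reachable instance, the function prints a single
# JSON object on one line, roughly (field values here are hypothetical):
#   {"url":"https://example.com","country":"US","version":"v0.30.1","cloudflare":true,"description":"Example instance"}
# Onion instances use the key "onion" instead of "url"; "cloudflare" and
# "description" are only emitted when set.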

# NOTES
#
# use jq --slurp to turn multiple objects into array
#
# load any existing onion sites from json:
# jq -Mcer '.instances[] | select(.onion)' instances-example.json

# helpdoc
#
# Print usage information to stdout.
helpdoc ()
{
    cat <<!
USAGE
    ${BASH_SOURCE[0]} [-I INPUT_JSON] [-T] [-f] [-i INPUT_CSV] [-o OUTPUT_JSON]
    ${BASH_SOURCE[0]} -h

DESCRIPTION
    Generate a JSON of Libreddit instances, given a CSV input listing those
    instances.

    The INPUT_CSV file must be a file in CSV syntax of the form

        [url],[country code],[cloudflare enabled],[description]

    where all four parameters are required (though the description may be
    blank). Except for onion sites, all URLs MUST be HTTPS.

    OUTPUT_JSON will be overwritten if it exists. No confirmation will be
    requested from the user.

    By default, this script will attempt to connect to instances in the CSV
    that are on Tor, provided that it can (it will check to see if Tor is
    running and the availability of the torsocks program). If you want to
    disable connections to these onion sites, provide the -T option.

OPTIONS
    -I INPUT_JSON
        Import the list of Libreddit onion instances from the file INPUT_JSON.
        To use stdin, provide \`-I -\`. Implies -T. Note that the argument
        provided to this option CANNOT be the same as the argument provided to
        -i. If the JSON could not be read, the script will exit with status
        code 1.

    -T
        Do not connect to Tor. Onion sites in INPUT_CSV will not be processed.
        Assuming no other failure, the script will still exit with status code
        0.

    -f
        Force the script to exit, with status code 1, upon the first failure to
        connect to an instance. Normally, the script will continue to build and
        output the JSON even when one or more of the instances could not be
        reached, though the exit code will be non-zero.

    -i INPUT_CSV
        Use INPUT_CSV as the input file. To read from stdin (the default
        behavior), either omit this option or provide \`-i -\`. Note that the
        argument provided to this option CANNOT be the same as the argument
        provided to -I.

    -o OUTPUT_JSON
        Write the results to OUTPUT_JSON. Any existing file will be
        overwritten. To write to stdout (the default behavior), either omit
        this option or provide \`-o -\`.

ENVIRONMENT

    USER_AGENT
        Sets the User-Agent that curl will use when making the GET request to
        each website.
!
}
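
# Illustrative invocations (file names are placeholders):
#   ./generate-instances-json.sh -i instances.csv -o instances.json
#   ./generate-instances-json.sh -T -i instances.csv -o instances.json
#   ./generate-instances-json.sh -I instances.json -i instances.csv -o instances-new.json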

# main
#
# Main function.
main ()
{
    local opt=
    local OPTIND
    local OPTARG

    local failfast=n
    local do_tor=y
    local -a get_opts=()
    local -a missing_deps=()
    local import_onions_from_file=
    local input_file=/dev/stdin
    local output_file=/dev/stdout
    local -a instance_entries=()
    local -a imported_onions=()
    local instance_entry=
    local -i rc=0

    while getopts ":I:Tfhi:o:" opt
    do
        case "${opt}" in
            I) import_onions_from_file="${OPTARG}" ;;
            T) do_tor=n ;;
            f) failfast=y ;;
            h) helpdoc ; exit ;;
            i)
                input_file="${OPTARG}"
                if [[ -z "${input_file}" ]]
                then
                    echo >&2 "-i: Please specify a file."
                fi

                if [[ "${input_file}" == '-' ]]
                then
                    input_file=/dev/stdin
                fi
                ;;
            o)
                output_file="${OPTARG}"
                if [[ -z "${output_file}" ]]
                then
                    echo >&2 "-o: Please specify a file."
                fi

                if [[ "${output_file}" == '-' ]]
                then
                    output_file=/dev/stdout
                fi
                ;;
            \?)
                echo >&2 "-${OPTARG}: invalid option"
                helpdoc
                exit 255
                ;;
        esac
    done

    # Make sure we have necessary dependencies before moving forward.
    # shellcheck disable=SC2207
    IFS=$'\n' missing_deps=($(check_dependencies))

    if [[ ${#missing_deps[@]} -ne 0 ]]
    then
        {
            echo "Dependencies are missing. Please install them and then try running the script again."
            echo
            echo "Missing dependencies:"

            for dep in "${missing_deps[@]}"
            do
                echo -e "\t${dep}"
            done
        } >&2
        return 1
    fi

    # Special handling for -I.
    if [[ -n "${import_onions_from_file}" ]]
    then
        # Abort if -I and -i point to the same file.
        if [[ "${import_onions_from_file}" == "${input_file}" ]]
        then
            echo >&2 "-I and -i cannot point to the same file."
            echo >&2 "For more information, run: ${BASH_SOURCE[0]} -h"
            return 1
        fi

        # Set do_tor <- n so that we don't attempt to make tor connections.
        do_tor=n

        # Attempt to read in onion instances.
        # shellcheck disable=SC2207
        # (mapfile not ideal here since a pipe is required, inducing a
        # subshell, meaning nothing will actually get added to
        # imported_onions)
        IFS=$'\n' imported_onions=($(jq -Mcer '.instances[] | select(.onion)' "${import_onions_from_file}"))
        rc=$?

        if [[ ${rc} -ne 0 ]]
        then
            echo >&2 "Failed to read onion instances from existing JSON file."
            return 1
        fi
    fi

    # Check to see if we have tor. If we don't, then we will have to import
    # the existing tor instances from the JSON.
    # TODO: For I2P, we will likely have to do something similar.
    if [[ "${do_tor}" == "n" ]] || ! can_tor
    then
        if [[ "${do_tor}" == "y" ]]
        then
            echo >&2 "WARNING: Either the tor service is not running or torsocks is not available. Either way, onion sites will not be processed."
        fi
        do_tor="n"
        get_opts+=("-T")
    fi

    if [[ "${input_file}" != "/dev/stdin" ]]
    then
        if [[ ! -e "${input_file}" ]]
        then
            echo >&2 "${input_file}: No such file or directory"
            return 1
        fi

        if [[ -d "${input_file}" ]]
        then
            echo >&2 "${input_file}: Is a directory"
            return 1
        fi
    fi

    # Read in the CSV.
    local -a rows=()
    <"${input_file}" mapfile rows
    rc=$?

    if [[ ${rc} -ne 0 ]]
    then
        return ${rc}
    fi

    # Process the CSV, row by row.
    local -a values=()
    local -a failed=()
    local l=1
    local url=
    for row in "${rows[@]}"
    do
        # shellcheck disable=SC2207
        IFS=$'\n' values=($(read_csv_row "${row}"))
        rc=$?

        if [[ ${rc} -ne 0 || ${#values[@]} -lt 3 || ${#values[@]} -gt 4 ]]
        then
            echo >&2 "${l}: failed to parse row"
            echo >&2 "Script will now terminate."
            return 2
        fi

        # Print friendly message to log while processing row.
        url="${values[0]}"
        echo -n >&2 "${url}: "

        instance_entry="$(IFS=$'\n' create_instance_entry "${get_opts[@]}" "${values[@]}")"
        rc=$?

        if [[ ${rc} -eq 0 ]]
        then
            IFS=$'\n' instance_entries+=("${instance_entry}")
            echo "OK"
        elif [[ ${rc} -eq 100 ]]
        then
            # rc=100 means the onion site is skipped because we told
            # create_instance_entry to skip the onion site.
            echo "SKIPPED"
        else
            echo "FAILED"

            if [[ "${failfast}" == "y" ]]
            then
                return 1
            fi

            failed+=("${url}")
        fi >&2

        (( l++ ))
        rc=0
    done

    # Assemble everything into JSON.
    # TODO: see if this can be done in one jq call, without having
    # to pass the list to jq --slurp and then everything to jq.
    printf '{"updated":"%s","instances":%s}' "${TODAY}" "$(IFS=$'\n'
        for instance in "${instance_entries[@]}" "${imported_onions[@]}"
        do
            echo "${instance}"
        done | jq -Mcers .
    )" | jq -Mer . >"${output_file}"
    rc=$?

    if [[ ${rc} -ne 0 ]]
    then
        echo >&2 "There was a problem processing the JSON. The output file may be corrupted."
    fi

    if [[ ${#failed[@]} -gt 0 ]]
    then
        {
            echo "The following instances could not be reached:"
            for failed_url in "${failed[@]}"
            do
                echo -e "\t${failed_url}"
            done
        } >&2

        return 1
    fi

    return ${rc}
}
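
# The emitted JSON has this overall shape (values here are illustrative; the
# final jq call pretty-prints the real output):
#   {"updated":"2024-01-01","instances":[{"url":"https://example.com","country":"US","version":"v0.30.1"}]}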

if [[ "${BASH_SOURCE[0]}" == "${0}" ]]
then
    main "${@}"
    exit
fi