Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenwrite.git
#!/usr/bin/env bash

# Dependency tree to parse: the first argument, or standard input if omitted.
input_file="${1:-/dev/stdin}"

# Coordinates ("group:artifact:version") that were already processed; used to
# emit each dependency only once.
declare -A seen_dependencies

# Extended-regex fragments used to recognise a group:artifact:version triple.
alphanum="[a-zA-Z0-9]"
name_chars="[a-zA-Z0-9._-]"
version_chars="[a-zA-Z0-9._+-]"

# Group and artifact identifiers: an alphanumeric followed by name characters.
group_artifact="${alphanum}${name_chars}*"

# A version begins and ends with an alphanumeric. The tail is optional so that
# one-character versions (for example "1") are accepted as well; the extra
# group sits inside the version capture and does not shift earlier captures.
version_pattern="${alphanum}(${version_chars}*${alphanum})?"

# XPath queries that ignore XML namespaces by matching on local-name().
NS_LICENSE_NAME='//*[local-name()="licenses"]/*[local-name()="license"]/*[local-name()="name"]/text()'
NS_LICENSE_URL='//*[local-name()="licenses"]/*[local-name()="license"]/*[local-name()="url"]/text()'
NS_PROJECT_NAME='/*[local-name()="project"]/*[local-name()="name"]/text()'
NS_ORG_NAME='//*[local-name()="organization"]/*[local-name()="name"]/text()'
NS_SCM_URL='//*[local-name()="scm"]/*[local-name()="url"]/text()'
NS_DEVELOPER_NAMES='//*[local-name()="developers"]/*[local-name()="developer"]/*[local-name()="name"]/text()'

# Writes all arguments to stdout as one string, joined by the first character
# of IFS (a space by default), without a trailing newline.
#
# printf is used instead of `echo -n` so that an argument such as "-n" or "-e"
# is printed literally rather than being consumed as an echo option.
#
# Arguments: $@ - text fragments to print
# Outputs:   the joined text on stdout
out() {
  printf '%s' "$*"
}

# Prints a JSON member of the form "key":"value", optionally followed by a
# comma, without a trailing newline.
#
# The value is JSON-escaped (backslash, double quote, newline, carriage
# return and tab) so that metadata containing those characters cannot break
# the surrounding document. The key is written as given.
#
# Arguments: $1 - member name
#            $2 - member value (plain text)
#            $3 - "true" to append a trailing comma; anything else for none
# Outputs:   the member on stdout
# Returns:   0
field() {
  local key="$1"
  local value="$2"
  local comma="$3"

  # Backslashes must be escaped first so later escapes are not doubled.
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  value=${value//$'\r'/\\r}
  value=${value//$'\t'/\\t}

  printf '"%s":"%s"' "$key" "$value"

  # An explicit `if` keeps the exit status at 0 when no comma is requested.
  if [[ "$comma" == "true" ]]; then
    printf ','
  fi
}

# Maps a free-form license name to an SPDX-style identifier using glob
# patterns. The first matching arm wins, so more specific patterns are listed
# before the broader ones they overlap with (for example LGPL before GPL).
#
# Arguments: $1 - license name as found in project metadata
# Outputs:   the identifier on stdout, or "UNKNOWN" when nothing matches
map_spdx_id() {
  case "$1" in
    *Apache*2.0*) echo "Apache-2.0" ;;
    *MIT*) echo "MIT" ;;
    *BSD*2* | *2*BSD*) echo "BSD-2-Clause" ;;
    *3*BSD* | *BSD*3*) echo "BSD-3-Clause" ;;
    *LGPL*2*later) echo "LGPL-2.1-or-later" ;;
    *LGPL*2*) echo "LGPL-2.1-only" ;;
    *LGPL*3*) echo "LGPL-3.0-only" ;;
    *GPL*2*CE*) echo "GPL-2.0-with-classpath-exception" ;;
    # "GPLv2" is already covered by this pattern; no separate arm is needed.
    *GPL*2*) echo "GPL-2.0-only" ;;
    *GPL*3*) echo "GPL-3.0-only" ;;
    # The spelled-out form requires "Eclipse Public" so that other Eclipse
    # licenses (such as the Eclipse Distribution License) are not mislabelled.
    *EPL*1.0* | *"Eclipse Public"*1.0*) echo "EPL-1.0" ;;
    *EPL*2.0* | *"Eclipse Public"*2.0*) echo "EPL-2.0" ;;
    *MPL*1.1* | *Mozilla*1.1*) echo "MPL-1.1" ;;
    *MPL*2.0* | *Mozilla*2.0*) echo "MPL-2.0" ;;
    *) echo "UNKNOWN" ;;
  esac
}

# Emits a JSON array containing one object per unique group:artifact:version
# coordinate found in the dependency tree read from $input_file. Metadata is
# taken from the artifact's POM first and from its mvnrepository.com page as a
# fallback. The tree format (branch markers followed by "---") looks like
# Gradle `dependencies` output — TODO confirm.
out "["
is_first=1

# `|| [[ -n "$line" ]]` also processes a final line lacking a trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
  # Skip anything that is not a tree branch line.
  [[ ! "$line" =~ ^[[:space:]]*[\+\|\\].*--- ]] && continue

  # Strip every leading run of branch markers ("|    +--- ", "\--- ", ...),
  # not only the first, so that dependencies nested beneath a "|" column are
  # kept; then trim trailing whitespace.
  cleaned=$(awk '{sub(/^([[:space:]]*[-+|\\]+)+[[:space:]]*/, ""); sub(/[[:space:]]*$/, ""); print}' <<<"$line")

  # Lines ending in "(*)" are skipped.
  [[ "$cleaned" =~ \(\*\)$ ]] && continue

  # Drop a trailing parenthesised annotation and any " -> ..." suffix.
  # NOTE(review): for "1.0 -> 1.2" this keeps the left-hand version; confirm
  # that is the version whose metadata should be reported.
  cleaned=$(awk '{sub(/[[:space:]]*\([^)]*\)$/, ""); sub(/ -> .*/, ""); print}' <<<"$cleaned")

  # Anything that is not a full group:artifact:version triple is ignored.
  [[ "$cleaned" =~ ^(${group_artifact}):(${group_artifact}):(${version_pattern})$ ]] || continue

  group="${BASH_REMATCH[1]}"
  artifact="${BASH_REMATCH[2]}"
  version="${BASH_REMATCH[3]}"
  key="$group:$artifact:$version"

  # Report each coordinate only once.
  [[ -n "${seen_dependencies[$key]:-}" ]] && continue
  seen_dependencies[$key]=1

  # Dots in the group become path separators in the repository layout.
  group_path="${group//./\/}"
  pom_url="https://repo1.maven.org/maven2/$group_path/$artifact/$version/$artifact-$version.pom"
  page_url="https://mvnrepository.com/artifact/$group/$artifact/$version"

  license_names=()
  license_urls=()
  developer_names=()
  project_name=""
  copyright=""
  source_url=""
  homepage_url=""

  # Primary source: the POM. xmllint errors (e.g. no matching node) are
  # deliberately silenced; a missing value simply stays empty.
  if pom=$(wget -q -O - "$pom_url"); then
    project_name=$(xmllint --xpath "string($NS_PROJECT_NAME)" - 2>/dev/null <<<"$pom")
    copyright=$(xmllint --xpath "string($NS_ORG_NAME)" - 2>/dev/null <<<"$pom")
    source_url=$(xmllint --xpath "string($NS_SCM_URL)" - 2>/dev/null <<<"$pom")
    mapfile -t license_names < <(xmllint --xpath "$NS_LICENSE_NAME" - 2>/dev/null <<<"$pom")
    mapfile -t license_urls < <(xmllint --xpath "$NS_LICENSE_URL" - 2>/dev/null <<<"$pom")
    mapfile -t developer_names < <(xmllint --xpath "$NS_DEVELOPER_NAMES" - 2>/dev/null <<<"$pom")
  fi

  # Secondary source: the artifact's web page.
  html_content=$(wget -q --user-agent="KeenWrite/5.0" -O - "$page_url")

  # xmllint prints attribute nodes as ` href="..."` (note the leading space),
  # so surrounding whitespace is removed together with the attribute syntax.
  homepage_url=$(xmllint --html --xpath '//tr[th[text()="HomePage"]]/td/a/@href' - 2>/dev/null <<<"$html_content" \
    | sed -E 's/^[[:space:]]*href="//;s/"[[:space:]]*$//')

  # Name resolution order: POM, page heading, page title, artifact id.
  if [[ -z "$project_name" ]]; then
    project_name=$(xmllint --html --xpath 'string(//h2[@class="title"])' - 2>/dev/null <<<"$html_content" \
      | sed -E 's/^[[:space:]]*//;s/[[:space:]]*$//')
    if [[ -z "$project_name" ]]; then
      project_name=$(xmllint --html --xpath 'string(//title)' - 2>/dev/null <<<"$html_content" \
        | sed -E 's/ - MVNRepository$//')
    fi
  fi
  [[ -z "$project_name" ]] && project_name="$artifact"

  if [[ ${#license_names[@]} -eq 0 ]]; then
    mapfile -t license_names < <(xmllint --html --xpath '//div[@class="version-section"]/h2[text()="Licenses"]/following-sibling::table//tr/td[1]/text()' - 2>/dev/null <<<"$html_content")
    mapfile -t license_urls < <(xmllint --html --xpath '//div[@class="version-section"]/h2[text()="Licenses"]/following-sibling::table//tr/td[2]/a/@href' - 2>/dev/null <<<"$html_content" \
      | sed -E 's/^[[:space:]]*href="//;s/"[[:space:]]*$//')
  fi

  if [[ ${#developer_names[@]} -eq 0 ]]; then
    mapfile -t developer_names < <(xmllint --html --xpath '//h2[text()="Developers"]/following-sibling::div//tbody/tr/td[1]/text()' - 2>/dev/null <<<"$html_content")
  fi

  # Separate this object from the previous one.
  [[ $is_first -eq 0 ]] && out ","
  is_first=0

  out "{"
  field "group" "$group" true
  field "artifact" "$artifact" true
  field "version" "$version" true
  field "url" "$page_url" true
  field "name" "$project_name" true

  out "\"licenses\":["
  for i in "${!license_names[@]}"; do
    [[ $i -gt 0 ]] && out ","
    name="${license_names[$i]}"
    # There may be fewer URLs than names; a missing URL becomes "".
    url="${license_urls[$i]:-}"
    spdx=$(map_spdx_id "$name")
    out "{"
    field "name" "$name" true
    field "url" "$url" true
    field "spdx" "$spdx" false
    out "}"
  done
  out "],"

  field "copyright" "$copyright" true
  field "source" "$source_url" true
  field "homepage" "$homepage_url" true

  out "\"developers\":["
  for i in "${!developer_names[@]}"; do
    [[ $i -gt 0 ]] && out ","
    # Escape backslashes and quotes so a name cannot break the JSON string.
    developer="${developer_names[$i]}"
    developer=${developer//\\/\\\\}
    developer=${developer//\"/\\\"}
    out "\"$developer\""
  done
  out "]"

  out "}"
done < "$input_file"

out "]"