# Dependency listing to parse: the first argument, or stdin when none is given.
input_file="${1:-/dev/stdin}"

# Associative array keyed by "group:artifact:version".
declare -A seen_dependencies

# Regex fragments (POSIX ERE) used to recognise group:artifact:version.
alphanum="[a-zA-Z0-9]"
name_chars="[a-zA-Z0-9._-]"
version_chars="[a-zA-Z0-9._+-]"

# A group or artifact id starts with an alphanumeric character.
group_artifact="${alphanum}${name_chars}*"

# A version starts and ends with an alphanumeric character. The tail is
# optional so that one-character versions such as "1" are accepted too.
# The parentheses add a nested capture group; a capture group wrapped around
# the whole pattern still holds the complete version.
version_pattern="${alphanum}(${version_chars}*${alphanum})?"

# XPath expressions for POM fields. They compare local-name() rather than the
# qualified element name, so they match regardless of the XML namespace.
NS_LICENSE_NAME='//*[local-name()="licenses"]/*[local-name()="license"]/*[local-name()="name"]/text()'
NS_LICENSE_URL='//*[local-name()="licenses"]/*[local-name()="license"]/*[local-name()="url"]/text()'
NS_PROJECT_NAME='/*[local-name()="project"]/*[local-name()="name"]/text()'
NS_ORG_NAME='//*[local-name()="organization"]/*[local-name()="name"]/text()'
NS_SCM_URL='//*[local-name()="scm"]/*[local-name()="url"]/text()'
NS_DEVELOPER_NAMES='//*[local-name()="developers"]/*[local-name()="developer"]/*[local-name()="name"]/text()'
# Write all arguments to stdout, joined by single spaces, with no trailing
# newline. printf is used instead of `echo -n` so that arguments that look
# like echo options ("-n", "-e", "-E") are printed literally.
out() {
  printf '%s' "$*"
}
# Emit one JSON member of the form "key":"value", optionally followed by a
# comma.
# Arguments:
#   $1 - member name (written as-is)
#   $2 - member value; backslash, double quote, newline, carriage return and
#        tab are escaped so the result stays a valid JSON string
#   $3 - "true" to append a trailing comma; anything else appends nothing
# Outputs: writes the member to stdout via out().
# Returns: 0 whether or not a comma was requested.
field() {
  local key="$1"
  local value="$2"
  local comma="${3:-}"

  # Backslashes must be escaped first so later escapes are not doubled.
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  value=${value//$'\r'/\\r}
  value=${value//$'\t'/\\t}

  out "\"$key\":\"$value\""
  if [[ "$comma" == "true" ]]; then
    out ","
  fi
}
# Map a free-form license name to an SPDX-style identifier.
# Arguments: $1 - license name as found in a POM or HTML page
# Outputs:   the identifier on stdout, or "UNKNOWN" when no pattern matches
#
# Arms are tried top to bottom and the first match wins, so more specific
# patterns (LGPL before GPL, classpath exception before plain GPL 2) must
# stay above the general ones. Matching is case-sensitive.
map_spdx_id() {
  case "$1" in
    *Apache*2.0*) echo "Apache-2.0" ;;
    *MIT*) echo "MIT" ;;
    *BSD*2* | *2*BSD*) echo "BSD-2-Clause" ;;
    *BSD*3* | *3*BSD*) echo "BSD-3-Clause" ;;
    *LGPL*2*later) echo "LGPL-2.1-or-later" ;;
    *LGPL*2*) echo "LGPL-2.1-only" ;;
    *LGPL*3*) echo "LGPL-3.0-only" ;;
    # "CE", "CPE" and a spelled-out "classpath" all denote the classpath
    # exception (e.g. "GPLv2+CE", "GPL2 w/ CPE").
    # NOTE(review): *CE* also matches upper-case "LICENSE"/"LICENCE".
    *GPL*2*CE* | *GPL*2*CPE* | *GPL*2*[Cc]lasspath*)
      echo "GPL-2.0-with-classpath-exception"
      ;;
    # Also covers "GPLv2"; a separate *GPLv2* arm would never be reached.
    *GPL*2*) echo "GPL-2.0-only" ;;
    *GPL*3*) echo "GPL-3.0-only" ;;
    *EPL*1.0*) echo "EPL-1.0" ;;
    *EPL*2.0*) echo "EPL-2.0" ;;
    *MPL*1.1* | *Mozilla*1.1*) echo "MPL-1.1" ;;
    *MPL*2.0* | *Mozilla*2.0*) echo "MPL-2.0" ;;
    *) echo "UNKNOWN" ;;
  esac
}
# Walk the dependency-tree listing in $input_file and print a JSON array with
# one object per unique group:artifact:version coordinate. Metadata is read
# from the artifact's POM on repo1.maven.org; the mvnrepository.com page
# supplies the homepage and acts as a fallback for project name, licenses
# and developers.
out "["
is_first=1

# The `|| [[ -n "$line" ]]` clause keeps a final line without a newline.
while IFS= read -r line || [[ -n "$line" ]]; do
  # Only tree lines are considered: the first non-blank character is +, | or
  # a backslash, and "---" appears later on the line.
  [[ ! "$line" =~ ^[[:space:]]*[\+\|\\].*--- ]] && continue

  # Remove the entire tree-drawing prefix. Nested entries look like
  # "|    +--- g:a:v"; stripping only the first run of markers would leave
  # "+--- g:a:v", which can never match the coordinate regex below.
  cleaned=$(printf '%s\n' "$line" |
    awk '{sub(/^[-+|\\[:space:]]+/, ""); sub(/[[:space:]]*$/, ""); print}')

  # Entries ending in "(*)" are skipped.
  # NOTE(review): presumably the marker for a subtree listed elsewhere.
  [[ "$cleaned" =~ \(\*\)$ ]] && continue

  # Drop a trailing parenthesised marker and any " -> ..." suffix.
  # NOTE(review): this keeps the version before "->", not the one after it;
  # confirm that is the intended one.
  cleaned=$(printf '%s\n' "$cleaned" |
    awk '{sub(/[[:space:]]*\([^)]*\)$/, ""); sub(/ -> .*/, ""); print}')

  [[ "$cleaned" =~ ^(${group_artifact}):(${group_artifact}):(${version_pattern})$ ]] || continue
  group="${BASH_REMATCH[1]}"
  artifact="${BASH_REMATCH[2]}"
  version="${BASH_REMATCH[3]}"

  # Report each coordinate only once.
  key="$group:$artifact:$version"
  [[ -n "${seen_dependencies[$key]:-}" ]] && continue
  seen_dependencies[$key]=1

  group_path="${group//./\/}"
  pom_url="https://repo1.maven.org/maven2/$group_path/$artifact/$version/$artifact-$version.pom"
  page_url="https://mvnrepository.com/artifact/$group/$artifact/$version"

  license_names=()
  license_urls=()
  developer_names=()
  project_name=""
  copyright=""
  source_url=""
  homepage_url=""

  # Primary source: the POM. xmllint errors are silenced because missing
  # elements are expected; empty results are handled by the fallbacks below.
  if pom=$(wget -q -O - "$pom_url"); then
    project_name=$(xmllint --xpath "string($NS_PROJECT_NAME)" - 2>/dev/null <<<"$pom")
    copyright=$(xmllint --xpath "string($NS_ORG_NAME)" - 2>/dev/null <<<"$pom")
    source_url=$(xmllint --xpath "string($NS_SCM_URL)" - 2>/dev/null <<<"$pom")
    mapfile -t license_names < <(xmllint --xpath "$NS_LICENSE_NAME" - 2>/dev/null <<<"$pom")
    mapfile -t license_urls < <(xmllint --xpath "$NS_LICENSE_URL" - 2>/dev/null <<<"$pom")
    mapfile -t developer_names < <(xmllint --xpath "$NS_DEVELOPER_NAMES" - 2>/dev/null <<<"$pom")
  fi

  # Secondary source: the mvnrepository.com page.
  html_content=$(wget -q --user-agent="KeenWrite/5.0" -O - "$page_url")

  # xmllint prints attribute nodes as ' href="..."' with a leading blank, so
  # the sed expression strips optional leading whitespace as well.
  homepage_url=$(xmllint --html --xpath '//tr[th[text()="HomePage"]]/td/a/@href' - 2>/dev/null <<<"$html_content" |
    sed 's/^[[:space:]]*href="//;s/"$//')

  # Project name fallback chain: POM <name>, page heading, page title, and
  # finally the artifact id. The artifact id is applied last so the HTML
  # fallbacks are actually reachable.
  if [[ -z "$project_name" ]]; then
    project_name=$(xmllint --html --xpath 'string(//h2[@class="title"])' - 2>/dev/null <<<"$html_content" |
      sed -E 's/^[[:space:]]*//;s/[[:space:]]*$//')
    [[ -z "$project_name" ]] && project_name=$(xmllint --html --xpath 'string(//title)' - 2>/dev/null <<<"$html_content" |
      sed -E 's/ - MVNRepository$//')
  fi
  [[ -z "$project_name" ]] && project_name="$artifact"

  if [[ ${#license_names[@]} -eq 0 ]]; then
    mapfile -t license_names < <(xmllint --html --xpath '//div[@class="version-section"]/h2[text()="Licenses"]/following-sibling::table//tr/td[1]/text()' - 2>/dev/null <<<"$html_content")
    mapfile -t license_urls < <(xmllint --html --xpath '//div[@class="version-section"]/h2[text()="Licenses"]/following-sibling::table//tr/td[2]/a/@href' - 2>/dev/null <<<"$html_content" |
      sed 's/^[[:space:]]*href="//;s/"$//')
  fi

  if [[ ${#developer_names[@]} -eq 0 ]]; then
    mapfile -t developer_names < <(xmllint --html --xpath '//h2[text()="Developers"]/following-sibling::div//tbody/tr/td[1]/text()' - 2>/dev/null <<<"$html_content")
  fi

  # Separate objects with commas; none before the first one.
  [[ $is_first -eq 0 ]] && out ","
  is_first=0

  out "{"
  field "group" "$group" true
  field "artifact" "$artifact" true
  field "version" "$version" true
  field "url" "$page_url" true
  field "name" "$project_name" true

  out "\"licenses\":["
  for i in "${!license_names[@]}"; do
    [[ $i -gt 0 ]] && out ","
    name="${license_names[$i]}"
    # A license may have a name but no URL; the URL is then empty.
    url="${license_urls[$i]:-}"
    spdx=$(map_spdx_id "$name")
    out "{"
    field "name" "$name" true
    field "url" "$url" true
    field "spdx" "$spdx" false
    out "}"
  done
  out "],"

  field "copyright" "$copyright" true
  field "source" "$source_url" true
  field "homepage" "$homepage_url" true

  out "\"developers\":["
  for i in "${!developer_names[@]}"; do
    [[ $i -gt 0 ]] && out ","
    # Escape backslashes and double quotes so the name is a valid JSON string.
    developer="${developer_names[$i]}"
    developer=${developer//\\/\\\\}
    developer=${developer//\"/\\\"}
    out "\"$developer\""
  done
  out "]"
  out "}"
done < "$input_file"
out "]"