summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Michał Górny <mgorny@gentoo.org> 2021-10-23 22:37:15 +0200
committer: Michał Górny <mgorny@gentoo.org> 2021-10-23 22:46:04 +0200
commit: 8fc9ac325105fab5a44261fd4874d87738d31c43 (patch)
tree: 7e6b6fe2f84b76a9d4d8860aecf492532656fa7f /dev-python/nltk-data/nltk-data-20211023.ebuild
parent: net-misc/lldpd: update homepage (diff)
download: gentoo-8fc9ac325105fab5a44261fd4874d87738d31c43.tar.gz
gentoo-8fc9ac325105fab5a44261fd4874d87738d31c43.tar.bz2
gentoo-8fc9ac325105fab5a44261fd4874d87738d31c43.zip
dev-python/nltk-data: Update data files for 20211023
Closes: https://bugs.gentoo.org/819780
Signed-off-by: Michał Górny <mgorny@gentoo.org>
Diffstat (limited to 'dev-python/nltk-data/nltk-data-20211023.ebuild')
-rw-r--r-- dev-python/nltk-data/nltk-data-20211023.ebuild 203
1 files changed, 203 insertions, 0 deletions
diff --git a/dev-python/nltk-data/nltk-data-20211023.ebuild b/dev-python/nltk-data/nltk-data-20211023.ebuild
new file mode 100644
index 000000000000..df8437c785be
--- /dev/null
+++ b/dev-python/nltk-data/nltk-data-20211023.ebuild
@@ -0,0 +1,203 @@
+# Copyright 2020-2021 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=7
+
+inherit check-reqs  # the CHECKREQS_DISK_* values for this eclass are set further down
+
+DESCRIPTION="Data files for NLTK"
+HOMEPAGE="https://www.nltk.org/nltk_data/"
+
+# at least some of the files have poorly documented licenses
+# TODO: create a USE flag for free-ish subset
+LICENSE="all-rights-reserved"
+SLOT="0"
+KEYWORDS="amd64 x86"
+IUSE="extra"  # gates fetching and unpacking of PACKAGES_UNPACK_EXTRA_2020
+RESTRICT="bindist mirror"  # presumably follows from the unclear licensing noted above -- confirm
+
+BDEPEND="app-arch/unzip"  # every distfile added to SRC_URI is a .zip archive
+
+PACKAGES_ZIP_2020=(  # category/package ids installed still zipped by install_zips; distfile version 20200312
+ # wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort
+ corpora/comtrans
+ corpora/conll2007
+ corpora/jeita
+ corpora/knbc
+ corpora/machado
+ corpora/masc_tagged
+ corpora/nombank.1.0
+ corpora/panlex_swadesh
+ corpora/propbank
+ corpora/reuters
+ corpora/semcor
+ corpora/universal_treebanks_v20
+ sentiment/vader_lexicon
+ stemmers/snowball_data
+)
+
+PACKAGES_UNPACK_2020=(  # category/package ids extracted by src_unpack; distfile version 20200312
+ # wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort
+ corpora/abc
+ corpora/alpino
+ corpora/brown
+ corpora/cess_cat
+ corpora/cess_esp
+ corpora/chat80
+ corpora/city_database
+ corpora/cmudict
+ corpora/comparative_sentences
+ corpora/conll2000
+ corpora/conll2002
+ corpora/crubadan
+ corpora/dependency_treebank
+ corpora/dolch
+ corpora/europarl_raw
+ corpora/floresta
+ corpora/framenet_v15
+ corpora/framenet_v17
+ corpora/gazetteers
+ corpora/genesis
+ corpora/gutenberg
+ corpora/ieer
+ corpora/inaugural
+ corpora/indian
+ corpora/lin_thesaurus
+ corpora/mac_morpho
+ corpora/movie_reviews
+ corpora/mte_teip5
+ corpora/names
+ corpora/nonbreaking_prefixes
+ corpora/nps_chat
+ corpora/omw
+ corpora/opinion_lexicon
+ corpora/pl196x
+ corpora/ppattach
+ corpora/product_reviews_1
+ corpora/product_reviews_2
+ corpora/pros_cons
+ corpora/ptb
+ corpora/qc
+ corpora/rte
+ corpora/senseval
+ corpora/sentence_polarity
+ corpora/sentiwordnet
+ corpora/shakespeare
+ corpora/sinica_treebank
+ corpora/state_union
+ corpora/subjectivity
+ corpora/swadesh
+ corpora/switchboard
+ corpora/timit
+ corpora/toolbox
+ corpora/treebank
+ corpora/twitter_samples
+ corpora/udhr
+ corpora/udhr2
+ corpora/verbnet
+ corpora/webtext
+ corpora/wordnet
+ corpora/wordnet_ic
+ corpora/words
+ grammars/book_grammars
+ grammars/large_grammars
+ grammars/sample_grammars
+ misc/perluniprops
+ models/bllip_wsj_no_aux
+ models/moses_sample
+ models/wmt15_eval
+ models/word2vec_sample
+ stemmers/porter_test
+ stemmers/rslp
+ taggers/averaged_perceptron_tagger
+ taggers/averaged_perceptron_tagger_ru
+ taggers/universal_tagset
+ tokenizers/punkt
+)
+
+PACKAGES_UNPACK_2021=(  # extracted by src_unpack as well, but fetched under distfile version 20211023
+ corpora/stopwords
+ corpora/wordnet31
+)
+
+PACKAGES_UNPACK_EXTRA_2020=(  # fetched and extracted only with USE=extra; distfile version 20200312
+ chunkers/maxent_ne_chunker
+ corpora/biocreative_ppi
+ corpora/brown_tei
+ corpora/kimmo
+ corpora/paradigms
+ corpora/pe08
+ corpora/pil
+ corpora/problem_reports
+ corpora/smultron
+ corpora/unicode_samples
+ corpora/verbnet3
+ corpora/ycoe
+ grammars/basque_grammars
+ grammars/spanish_grammars
+ help/tagsets
+ misc/mwa_ppdb
+ taggers/maxent_treebank_pos_tagger
+)
+
+add_data() {
+ local x version=${1}
+ shift
+
+ for x; do
+ SRC_URI+="
+ https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${x}.zip
+ -> nltk-${x#*/}-${version}.zip"
+ done
+}
+
+add_data 20200312 "${PACKAGES_ZIP_2020[@]}" "${PACKAGES_UNPACK_2020[@]}"  # unconditional distfiles, version 20200312
+add_data 20211023 "${PACKAGES_UNPACK_2021[@]}"  # unconditional distfiles, version 20211023
+SRC_URI+="
+ extra? ("
+add_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}"  # lands between "extra? (" and ")", so these entries are USE-conditional
+SRC_URI+="
+ )"
+
+CHECKREQS_DISK_USR=3G
+CHECKREQS_DISK_BUILD=${CHECKREQS_DISK_USR}  # kept equal to CHECKREQS_DISK_USR
+
+unpack_data() {
+ local x version=${1}
+ shift
+
+ for x; do
+ local cat=${x%/*}
+ local pkg=${x#*/}
+
+ mkdir -p "${S}/${cat}" || die
+ cd "${S}/${cat}" || die
+ unpack "nltk-${pkg}-${version}.zip"
+ done
+}
+
+src_unpack() {
+ unpack_data 20200312 "${PACKAGES_UNPACK_2020[@]}"
+ unpack_data 20211023 "${PACKAGES_UNPACK_2021[@]}"
+ use extra && unpack_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}"
+}
+
+install_zips() {
+ local x version=${1}
+ shift
+
+ for x; do
+ local cat=${x%/*}
+ local pkg=${x#*/}
+
+ insinto "/usr/share/nltk_data/${cat}"
+ newins "${DISTDIR}/nltk-${pkg}-${version}.zip" "${pkg}.zip"
+ done
+}
+
+# Install the extracted data tree plus the archives that stay zipped.
+src_install() {
+	local datadir=/usr/share/nltk_data
+
+	dodir "${datadir}"
+	# Move rather than copy everything in the current directory into the
+	# image (presumably ${S}, where src_unpack extracted the data -- the
+	# phase's starting directory is not set in this file). This runs before
+	# install_zips, so the category directories are not yet present in the
+	# image when they are moved in.
+	mv * "${ED}${datadir}/" || die
+
+	install_zips 20200312 "${PACKAGES_ZIP_2020[@]}"
+}