Skip to content

Commit

Permalink
some fixes for new corpora
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgtied committed Nov 1, 2023
1 parent d99bb71 commit f016a93
Show file tree
Hide file tree
Showing 4,980 changed files with 21,431 additions and 1,241,541 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
13 changes: 8 additions & 5 deletions Makefile.def
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,18 @@ GZCAT := ${GZIP} -cd
ZCAT := gzip -cd

# Default sort command: temporary files go to ${TMPDIR}, the main-memory
# buffer is 1G, and ${THREADS} sorts are run concurrently.
SORT := sort -T ${TMPDIR} -S1G --parallel=${THREADS}
# Alternative settings kept for reference; two of them additionally compress
# temporary files with ${GZIP}.
# SORT := sort -T ${TMPDIR} --parallel=${THREADS} -S1G --compress-program=${GZIP}
# SORT := sort -T ${TMPDIR} --parallel=${THREADS} -S1G
# SORT := sort -T ${TMPDIR} -S1G --parallel=${THREADS} --compress-program=${GZIP}

# UNIQ and MERGE are simply expanded (:=), so they capture the value that SORT
# has at this point; a later reassignment of SORT does not change them.
UNIQ := ${SORT} -u
MERGE := ${SORT} -m -u

# Replace SORT with parsort when `which` finds it on the PATH
# (an empty result from the shell call leaves SORT as defined above).
ifneq (${shell which parsort 2>/dev/null},)
SORT := parsort -T ${TMPDIR} -S1G
endif

## I don't really trust parsort ...

# ifneq (${shell which parsort 2>/dev/null},)
# SORT := parsort -T ${TMPDIR} -S1G
# endif



## seems to be necessary to run with threads on HPC nodes
Expand Down
2 changes: 1 addition & 1 deletion OPUS
Submodule OPUS updated 682 files
2 changes: 1 addition & 1 deletion OPUS-website
Submodule OPUS-website updated 445 files
9 changes: 1 addition & 8 deletions corpus/HPLT/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,6 @@ all-job:
# ${MAKE} HPC_CORES=1 HPC_MEM=4g HPC_TIME=72:00 fetch-job.submit


# new-sr: run a fixed sequence of sub-makes: build `all` in src, run
# annotate_files once with LANGUAGE=sr and once with LANGUAGE=en, then run the
# moses, copy-tmx-files and publish targets.
# NOTE(review): the recipe lines below show no leading TAB in this view;
# confirm the indentation in the actual Makefile.
new-sr:
${MAKE} -C src all
${MAKE} LANGUAGE=sr annotate_files
${MAKE} LANGUAGE=en annotate_files
${MAKE} moses
${MAKE} copy-tmx-files
${MAKE} publish


fetch-job:
${MAKE} -C src fetch
Expand Down Expand Up @@ -68,6 +60,7 @@ publish-job:
${MAKE} moses
${MAKE} copy-tmx-files
${MAKE} publish
${MAKE} release
# ${MAKE} wordalign/Makefile
# ${MAKE} -C wordalign submit-all
# ${MAKE} HPC_CORES=8 HPC_MEM=8g HPC_TIME=72:00 udparse.submit
Expand Down
4 changes: 2 additions & 2 deletions corpus/HPLT/Makefile.def
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

# Name of the corpus.
CORPUS = HPLT

# Release number. RELEASE is assigned twice here; the later assignment (1.1)
# is the one that takes effect.
RELEASE = 1
RELEASE = 1.1
# Version label: "v" followed by the release number. VERSION is recursively
# expanded (=), so it picks up the final value of RELEASE when it is used.
VERSION = v${RELEASE}

# With SRCHTML you can define some extra HTML code that will be placed just
# after the name of the sub-corpus.
# EXTRAHTML is code which will be put just after the 'Download' header.
# CITENOTE holds the acknowledgement text for the corpus
# (presumably shown alongside the corpus description; confirm where it is used).

SRCHTML = Parallel corpora from Web Crawls collected in the <a href="https://hplt-project.org/">HPLT project</a>
# EXTRAHTML is assigned twice; the second assignment replaces the first.
EXTRAHTML = The data is released under the <a href="https://creativecommons.org/share-your-work/public-domain/cc0/">Creative Commons CC0 license</a> ("no rights reserved"), see <a href="http://paracrawl.eu/download.html">http://paracrawl.eu/download.html</a> for more details
EXTRAHTML = The data packaging is released under the <a href="https://creativecommons.org/share-your-work/public-domain/cc0/">Creative Commons CC0 license</a> ("no rights reserved"), see <a href="https://hplt-project.org/">https://hplt-project.org/</a> for more details
CITENOTE = Please, acknowledge the HPLT project at <a href="https://hplt-project.org/">https://hplt-project.org/</a>. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.
Loading

0 comments on commit f016a93

Please sign in to comment.