Skip to content

Wiki to Text

Adrian Wilke edited this page Feb 9, 2021 · 5 revisions
#!/bin/bash
#
# Converts every MediaWiki-markup file in an input directory to plain text.
#
# Usage: <script> <input directory> <output directory>
#
# For each regular file directly inside the input directory, pandoc (with
# the pandoc-citeproc filter) writes a plain-text version under the same
# file name into the output directory. Bracketed markers such as [1] and
# whitespace-only lines are then removed from that output file in place.
# The in-place edit uses `sed -i` without a backup suffix (the GNU form).
#
# Exit status:
#   0  every file was converted and cleaned
#   1  wrong argument count, missing input directory, output directory
#      could not be created, or at least one file failed

# Treat use of an unset variable as an error.
set -u

if [[ $# -ne 2 ]]; then
  echo 'Please provide: <input directory> <output directory>' >&2
  exit 1
fi

# Remove slash at end
INDIR=${1%/}
OUTDIR=${2%/}

# The trailing slash is re-added in the checks below so that an argument
# of "/" (which the stripping above turns into an empty string) still
# refers to the root directory.
if [[ ! -d "$INDIR/" ]]; then
  printf 'Input directory not found: %s\n' "$1" >&2
  exit 1
fi

if ! mkdir -p -- "$OUTDIR/"; then
  printf 'Cannot create output directory: %s\n' "$2" >&2
  exit 1
fi

# Becomes 1 as soon as any file fails; the loop keeps going so that one
# bad file does not prevent the remaining files from being converted.
STATUS=0

for FILEPATH in "$INDIR"/*; do
  # Skip subdirectories, and the literal pattern that the glob leaves
  # behind when the input directory is empty.
  [[ -f "$FILEPATH" ]] || continue

  # Only file name
  FILE=${FILEPATH##*/}
  OUTFILE="$OUTDIR/$FILE"

  # Convert from wiki-markup to plain text
  if ! pandoc --filter pandoc-citeproc -f mediawiki -t plain \
      -o "$OUTFILE" "$FILEPATH"; then
    printf 'Conversion failed: %s\n' "$FILEPATH" >&2
    STATUS=1
    # Do not post-process a missing or stale output file.
    continue
  fi

  # Remove markers [1], then remove empty lines. Both expressions act on
  # one line at a time, so a single pass gives the same result as two
  # separate in-place passes.
  if ! sed -i \
      -e 's/\[[^]]*\]//g' \
      -e '/^[[:space:]]*$/d' \
      -- "$OUTFILE"; then
    printf 'Clean-up failed: %s\n' "$OUTFILE" >&2
    STATUS=1
  fi
done

exit "$STATUS"

# Data Science Group (DICE) at Paderborn University
# This work has been supported by the German Federal Ministry of Education and Research (BMBF) within the project EML4U under grant no. 01IS19080B.
Clone this wiki locally