Skip to content

tokenizer.perl error

utf8 does not map to Unicode at /usr/local/scripts/tokenizer/tokenizer.perl

 

solution:

Convert the input file to UTF-8 (the example below assumes the original file is encoded as ISO-8859-1):

iconv -f ISO-8859-1 -t UTF8 file-original.txt -o file-converted.txt

irstlm error

/usr/include/c++/7/cstdlib:75:15: fatal error: stdlib.h: No such file or directory

 

solution:

In src/Makefile.am, replace the AM_CXXFLAGS line (change -isystem/usr/include to -I/usr/include)

from

AM_CXXFLAGS = -static -isystem/usr/include -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES $(BOOST_CPPFLAGS) -DMYCODESIZE=3

to

AM_CXXFLAGS = -static -I/usr/include -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES $(BOOST_CPPFLAGS) -DMYCODESIZE=3

lmplz error

terminate called after throwing an instance of 'lm::builder::BadDiscountException'

 what(): /usr/local/src/mosesdecoder/lm/builder/adjust_counts.cc:61 in void lm::builder::{anonymous}::StatCollector::CalculateDiscounts(const lm::builder::DiscountConfig&) threw BadDiscountException because `discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j'.

ERROR: 4-gram discount out of range for adjusted count 3: -0.57606

solution:

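  • lower the n-gram order of the language model (e.g. build a 3-gram model instead of a 4-gram model)
  • or add more monolingual training data
  • or pass --discount_fallback to lmplz, which substitutes default discounts when they cannot be estimated from a very small corpus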

train-model.perl error

 WARNING: XX is a bad alignment point in sentence 1234

or

phrase-table.half.0000000.gz: not in gzip format

solution:

Do not use the symal binary built by MGIZA++; use the symal that ships with Moses instead.

Installing Moses Decoder on Ubuntu

  1. Get the source from here https://github.com/moses-smt/mosesdecoder
  2. Installing some dependencies
    sudo apt-get install libboost-all-dev
    sudo apt-get install zlib1g-dev
    sudo apt-get install libbz2-dev

    ————————
    bzip2 library manual installation

    wget http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz
    tar xzvf bzip2-1.0.6.tar.gz
    cd bzip2-1.0.6/
    
    make
    sudo make install
    sudo cp libbz2.a /usr/lib
    
    make clean
    make -f Makefile-libbz2_so
    sudo cp libbz2.so.* /usr/lib
    sudo ln -sf libbz2.so.1.0 /usr/lib/libbz2.so
  3. Extracting
    sudo mkdir -p /usr/local/src/mosesdecoder
    sudo chown ubuntu /usr/local/src/mosesdecoder/
    
    unzip mosesdecoder-master.zip -d /usr/local/src/mosesdecoder/
  4. Edit some RandLM header files if required
    ————
    RandLMQuantizer.h

    static const float kFloatErr = 0.00001f;

    to

    static constexpr float kFloatErr = 0.00001f;

    ————-
    RandLM.h

    static const float kNullLogProb = -1000000;
    static const float kUnknownLogProb = 1000000;

    to

    static constexpr float kNullLogProb = -1000000;
    static constexpr float kUnknownLogProb = 1000000;

    ————
    RandLMTypes.h

    #define iterate(c,i) for(typeof(c.begin()) i = c.begin(); i != c.end(); i++)

    to

    #define iterate(c,i) for(__typeof__(c.begin()) i = c.begin(); i != c.end(); i++)
  5. Compiling
    ./bjam --prefix=/usr/local/lib/mosesdecoder --with-irstlm=/usr/local/lib/irstlm --with-randlm=/usr/local/lib/randlm -j4 --debug-configuration -d2 >build.log
  6. Linking some files
    cp -r scripts /usr/local/lib/mosesdecoder
    
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/CreateOnDiskPt /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/build_binary /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/query /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/extractor /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/mert /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/processLexicalTable /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/processPhraseTable /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/queryLexicalTable /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/queryPhraseTable /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/moses_chart /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/moses /usr/local/bin
    sudo ln -sfn /usr/local/lib/mosesdecoder/bin/lmbrgrid /usr/local/bin
    
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/analysis/sentence-by-sentence.pl /usr/local/bin/sentence-by-sentence.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/experiment.perl /usr/local/bin/experiment.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/analysis.perl /usr/local/bin/analysis.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/berkeley-process.sh /usr/local/bin/berkeley-process.sh
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/berkeley-train.sh /usr/local/bin/berkeley-train.sh
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/consolidate-training-data.perl /usr/local/bin/consolidate-training-data.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/generic-multicore-parallelizer.perl /usr/local/bin/generic-multicore-parallelizer.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/generic-parallelizer.perl /usr/local/bin/generic-parallelizer.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/input-from-sgm.perl /usr/local/bin/input-from-sgm.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/interpolate-lm.perl /usr/local/bin/interpolate-lm.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/reference-from-sgm.perl /usr/local/bin/reference-from-sgm.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/remove-segmenation-markup.perl /usr/local/bin/remove-segmenation-markup.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/report-experiment-scores.perl /usr/local/bin/report-experiment-scores.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/reuse-weights.perl /usr/local/bin/reuse-weights.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/run-command-on-multiple-refsets.perl /usr/local/bin/run-command-on-multiple-refsets.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/ems/support/wrap-xml.perl /usr/local/bin/wrap-xml.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/generic/compound-splitter.perl /usr/local/bin/compound-splitter.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/generic/extract-factors.pl /usr/local/bin/extract-factors.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/generic/lopar2pos.pl /usr/local/bin/lopar2pos.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/generic/moses-parallel.pl /usr/local/bin/moses-parallel.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/generic/mteval-v12.pl /usr/local/bin/mteval-v12.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/generic/multi-bleu.perl /usr/local/bin/multi-bleu.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/generic/qsub-wrapper.pl /usr/local/bin/qsub-wrapper.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/tokenizer/detokenizer.perl /usr/local/bin/detokenizer.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/tokenizer/tokenizer.perl /usr/local/bin/tokenizer.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/tokenizer/lowercase.perl /usr/local/bin/lowercase.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/absolutize_moses_model.pl /usr/local/bin/absolutize_moses_model.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/build-generation-table.perl /usr/local/bin/build-generation-table.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/clean-corpus-n.perl /usr/local/bin/clean-corpus-n.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/clone_moses_model.pl /usr/local/bin/clone_moses_model.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/mbr/mbr /usr/local/bin/mbr
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/filter-model-given-input.pl /usr/local/bin/filter-model-given-input.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/filter-rule-table.py /usr/local/bin/filter-rule-table.py
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/lexical-reordering/score /usr/local/bin/score
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/memscore/memscore /usr/local/bin/memscore
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/zmert-moses.pl /usr/local/bin/zmert-moses.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/mert-moses.pl /usr/local/bin/mert-moses.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/phrase-extract/extract /usr/local/bin/extract
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/phrase-extract/extract-rules /usr/local/bin/extract-rules
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/phrase-extract/score /usr/local/bin/score
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/phrase-extract/consolidate /usr/local/bin/consolidate
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/postprocess-lopar.perl /usr/local/bin/postprocess-lopar.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/reduce_combine.pl /usr/local/bin/reduce_combine.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/combine_factors.pl /usr/local/bin/combine_factors.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/train-model.perl /usr/local/bin/train-model.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/symal/symal /usr/local/bin/symal
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/symal/giza2bal.pl /usr/local/bin/giza2bal.pl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/wrappers/parse-de-bitpar.perl /usr/local/bin/parse-de-bitpar.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/wrappers/parse-en-collins.perl /usr/local/bin/parse-en-collins.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/wrappers/make-factor-en-pos.mxpost.perl /usr/local/bin/make-factor-en-pos.mxpost.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/wrappers/make-factor-pos.tree-tagger.perl /usr/local/bin/make-factor-pos.tree-tagger.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/training/wrappers/make-factor-stem.perl /usr/local/bin/make-factor-stem.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/recaser/train-recaser.perl /usr/local/bin/train-recaser.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/recaser/recase.perl /usr/local/bin/recase.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/recaser/truecase.perl /usr/local/bin/truecase.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/recaser/detruecase.perl /usr/local/bin/detruecase.perl
    sudo ln -sfn /usr/local/lib/mosesdecoder/scripts/recaser/train-truecaser.perl /usr/local/bin/train-truecaser.perl

Installing RandLM on Ubuntu

  1. Get RandLM source from here http://sourceforge.net/projects/randlm/
  2. Installing some dependencies
    sudo apt-get install sparsehash
  3. Extracting
    sudo mkdir -p /usr/local/src/randlm
    sudo chown username /usr/local/src/randlm/
    tar -xvzf randlm.tar.gz -C /usr/local/src/randlm/
  4. Compiling
    sudo mkdir -p /usr/local/lib/randlm
    sudo chown username /usr/local/lib/randlm
    ./configure --prefix=/usr/local/lib/randlm
    mv aclocal.m4 aclocal.m4_
    aclocal
    autoconf
    make
  5. Installing
    make install
    cp hadoop/m_compute_ngram_counts_batch /usr/local/lib/randlm/bin
    cp hadoop/r_compute_ngram_counts /usr/local/lib/randlm/bin
    cp hadoop/m_compute_ngram_counts_batch /usr/local/lib/randlm/bin/m-compute-ngram-counts-batch
    cp hadoop/r_compute_ngram_counts /usr/local/lib/randlm/bin/r-compute-ngram-counts
    sudo ln -sfn /usr/local/lib/randlm/bin/buildlm /usr/local/bin
    sudo ln -sfn /usr/local/lib/randlm/bin/querylm /usr/local/bin
    sudo ln -sfn /usr/local/lib/randlm/bin/m_compute_ngram_counts_batch /usr/local/bin
    sudo ln -sfn /usr/local/lib/randlm/bin/r_compute_ngram_counts /usr/local/bin
    sudo ln -sfn /usr/local/lib/randlm/bin/m-compute-ngram-counts-batch /usr/local/bin
    sudo ln -sfn /usr/local/lib/randlm/bin/r-compute-ngram-counts /usr/local/bin

Installing MGIZA++ on Ubuntu

  1. Get the source from here http://sourceforge.net/projects/mgizapp/
  2. Installing some dependencies
    sudo apt-get install cmake
    sudo apt-get install libboost-all-dev
  3. Extracting source files
    sudo mkdir -p /usr/local/src/mgizapp
    sudo chown username /usr/local/src/mgizapp/
    tar -xvzf mgizapp.tgz -C /usr/local/src/mgizapp/
  4. Configuring
    sudo mkdir -p /usr/local/lib/mgizapp
    sudo chown username /usr/local/lib/mgizapp/
    rm CMakeCache.txt
    cmake .
    make
  5. Installing
    make install
    cp -r inst/* /usr/local/lib/mgizapp/
  6. Creating the snt2cooc.out wrapper script (GIZA++ compatibility); save it as /usr/local/lib/mgizapp/bin/snt2cooc.out
    #! /bin/bash
    # snt2cooc.out: wrapper that lets callers expecting the GIZA++ command
    # "snt2cooc.out vcb1 vcb2 snt12" run the snt2cooc binary located in the
    # same directory as this script.
    #
    # Arguments: $1 - vcb1, $2 - vcb2, $3 - snt12 (forwarded unchanged)
    # Exit:      1 when the argument count is not 3; otherwise 0, or the
    #            failing status of snt2cooc (propagated by set -e).
    set -e

    # Print the expected command line and terminate with status 1.
    usage() {
        echo "Usage: snt2cooc.out vcb1 vcb2 snt12"
        echo "Converts GIZA++ snt-format into plain text."
        exit 1
    }

    # Exactly three positional arguments are required.
    if [ "$#" -ne 3 ]; then
        usage
    fi

    # ${0%/*} strips the script name from $0, leaving the directory it was
    # invoked from. /dev/stdout is passed ahead of the three inputs --
    # presumably snt2cooc's output path, so the result goes to standard
    # output (TODO confirm against the snt2cooc usage text).
    # All expansions are quoted so paths containing spaces or glob
    # characters are passed through as single arguments.
    "${0%/*}/snt2cooc" /dev/stdout "$1" "$2" "$3"
    exit 0

    chmod 755 snt2cooc.out

  7. Linking some files
    sudo ln -sfn /usr/local/lib/mgizapp/bin/mgiza /usr/local/bin/mgizapp
    sudo ln -sfn /usr/local/lib/mgizapp/bin/mgiza /usr/local/bin/GIZA++
    sudo ln -sfn /usr/local/lib/mgizapp/bin/mgiza /usr/local/bin
    sudo ln -sfn /usr/local/lib/mgizapp/bin/snt2cooc /usr/local/bin
    sudo ln -sfn /usr/local/lib/mgizapp/bin/mkcls /usr/local/bin
    sudo ln -sfn /usr/local/lib/mgizapp/bin/snt2cooc.out /usr/local/bin
    sudo ln -sfn /usr/local/lib/mgizapp/scripts/merge_alignment.py /usr/local/bin

Installing IRSTLM on Ubuntu

  1. Get the source from here http://sourceforge.net/projects/irstlm/
  2. Installing some dependencies
    sudo apt-get install build-essential
    sudo apt-get install automake
    sudo apt-get install libtool
    
    sudo apt-get install zlib1g-dev
  3. Extracting files
    sudo mkdir -p /usr/local/src/irstlm
    sudo chown username /usr/local/src/irstlm
    
    tar -xvzf irstlm.tgz -C /usr/local/src/irstlm
  4. Compiling
    sudo mkdir -p /usr/local/lib/irstlm 
    sudo chown username /usr/local/lib/irstlm
    
    ./regenerate-makefiles.sh
    
    ./configure --prefix=/usr/local/lib/irstlm --enable-caching
    
    make -j 4
  5. Installing
    make -j 4 install
    
    sudo ln -sfn /usr/local/lib/irstlm/bin/add-start-end.sh /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/build-lm-qsub.sh /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/build-lm.sh /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/build-sublm.pl /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/compile-lm /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/dict /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/goograms2ngrams.pl /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/interpolate-lm /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/lm-stat.pl /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/merge-sublm.pl /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/ngram-split.pl /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/ngt /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/plsa /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/prune-lm /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/quantize-lm /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/rm-start-end.sh /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/score-lm /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/sort-lm.pl /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/split-dict.pl /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/split-ngt.sh /usr/local/bin
    sudo ln -sfn /usr/local/lib/irstlm/bin/tlm /usr/local/bin