tag-checker.cpp

Utility to validate morphological dictionary entries read from stdin (requires Maca; compile with: g++ -lmaca tag-checker.cpp -o tag-checker) - Adam Radziszewski, 26 Jan 2011 13:40

Download (3.33 KB)

 
1
#include <libcorpus2/util/settings.h>
2
#include <libcorpus2/tagsetparser.h>
3
#include <libcorpus2/tagsetmanager.h>
4
#include <libcorpus2/token.h>
5

    
6
#include <libpwrutils/foreach.h>
7

    
8
#include <boost/program_options.hpp>
9
#include <boost/bind.hpp>
10
#include <boost/algorithm/string.hpp>
11

    
12

    
13
#include <algorithm>
14
#include <fstream>
15
#include <iterator>
16

    
17
/* Rudimentary tag checker. Reads tab-separated dictionary entries
 * (form, lemma, tag representation) from stdin, writes valid entries to
 * stdout and invalid entries, prefixed with an error description, to stderr.
 *
 * This code is a modification of Maca's tagset-tool (modified by
 * Adam Radziszewski) and may contain errors.
 * Licence: GNU GPL, see MACA docs.
 *
 * Compile: g++ -lmaca tag-checker.cpp -o tag-checker
 */
26

    
27
/**
 * Feeds dictionary entries read from standard input to a callback.
 *
 * Lines are read one at a time for as long as std::cin reports good().
 * Empty lines are skipped. Every other line is split at each tab or space
 * character (no token compression is requested, so adjacent separators
 * yield empty fields). A line that splits into exactly three fields is
 * passed to line_cb; any other line is echoed to stderr prefixed with
 * "UNEXPECTED\t".
 *
 * @param line_cb callback invoked with the three fields of each
 *                well-formed line
 */
void std_read_loop(boost::function<void (const std::vector<std::string> &)>& line_cb)
{
        while (std::cin.good()) {
                std::string raw;
                std::getline(std::cin, raw);
                if (raw.empty()) {
                        // Blank line, or nothing left to read: ignore.
                        continue;
                }

                std::vector<std::string> fields;
                boost::algorithm::split(fields, raw, boost::is_any_of("\t "));

                if (fields.size() == 3) {
                        line_cb(fields);
                        continue;
                }
                // Wrong number of fields: report the raw line untouched.
                std::cerr << "UNEXPECTED\t" << raw << "\n";
        }
}
46

    
47

    
48
/**
 * Checks one dictionary entry and reports the outcome.
 *
 * The third field (entry[2]) is handed to the tagset, which parses it into
 * the lexemes of a temporary token. When validate is set, the tag of each
 * lexeme is checked with Tagset::validate_tag; checking stops at the first
 * tag that fails.
 *
 * Output:
 *  - entry accepted: "form\tlemma\ttag\n" on stdout,
 *  - a tag failed validation: the same text prefixed with "INVALID\t" on
 *    stderr,
 *  - parsing threw Corpus2::TagParseError: the exception's info() followed
 *    by a tab and the entry on stderr.
 *
 * @param tagset    tagset used for parsing and validation
 * @param validate  whether parsed tags are validated
 * @param sort      not used by this function
 * @param entry     entry fields; indices 0, 1 and 2 are read
 * @param internals not used by this function
 */
void tag_parse_cb(const Corpus2::Tagset& tagset, bool validate, bool sort,
                const std::vector<std::string> &entry, bool internals)
{
        try {
                Corpus2::Token token;
                tagset.lexemes_into_token(token, UnicodeString(), entry[2]);

                bool all_valid = true;
                if (validate) {
                        foreach (const Corpus2::Lexeme& lexeme, token.lexemes()) {
                                if (!tagset.validate_tag(lexeme.tag(), false)) {
                                        // One bad tag is enough to reject the entry.
                                        all_valid = false;
                                        break;
                                }
                        }
                }

                // Accepted entries go to stdout, rejected ones to stderr.
                std::ostream& out = all_valid ? std::cout : std::cerr;
                if (!all_valid) {
                        out << "INVALID\t";
                }
                out << entry[0] << "\t" << entry[1] << "\t" << entry[2] << "\n";
        } catch (Corpus2::TagParseError& e) {
                std::cerr << e.info() << "\t" << entry[0] << "\t" << entry[1] << "\t" << entry[2] << "\n";
        }
}
71

    
72
int main(int argc, char** argv)
73
{
74
        std::string tagset_load;
75
        bool quiet = true, internals = false;
76
        bool validate = true, sort = false;
77
        using boost::program_options::value;
78

    
79
        boost::program_options::options_description desc("Allowed options");
80
        desc.add_options()
81
                        ("tagset,t", value(&tagset_load),
82
                         "Path to tagset ini file to load\n")
83
                        ("sort,s", value(&sort)->zero_tokens(),
84
                         "Sort parsed tags")
85
                        ("help,h", "Show help")
86
                        ;
87
        boost::program_options::variables_map vm;
88
        boost::program_options::positional_options_description p;
89
        p.add("tagset", -1);
90

    
91
        try {
92
                boost::program_options::store(
93
                        boost::program_options::command_line_parser(argc, argv)
94
                        .options(desc).positional(p).run(), vm);
95
        } catch (boost::program_options::error& e) {
96
                std::cerr << e.what() << "\n";
97
                return 2;
98
        }
99
        boost::program_options::notify(vm);
100

    
101
        if (vm.count("help")) {
102
                std::cout << desc << "\n";
103
                return 1;
104
        }
105

    
106
        Corpus2::Path::Instance().set_verbose(!quiet);
107

    
108
        if (!tagset_load.empty()) {
109
                try {
110
                        const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
111
                        boost::function<void (const std::vector<std::string> &)> f;
112
                        
113
                        
114
                        f = boost::bind(&tag_parse_cb, boost::ref(tagset), validate, sort, _1, internals);
115
                        
116
                        std_read_loop(f);
117
                } catch (Corpus2::Corpus2Error& e) {
118
                        std::cerr << "Error: " << e.info() << "\n";
119
                        return 4;
120
                }
121
        } else {
122
                std::cerr << "Usage: tag-ckecker [OPTIONS] <tagset-file>\n";
123
                std::cerr << "See tag-checker --help\n";
124
                return 1;
125
        }
126
        return 0;
127
}