Teuchos - Trilinos Tools Package Version of the Day
Loading...
Searching...
No Matches
Teuchos_regex.cpp
1// @HEADER
2// *****************************************************************************
3// Teuchos: Common Tools Package
4//
5// Copyright 2004 NTESS and the Teuchos contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#include "Teuchos_set.hpp"
11
12#include "Teuchos_regex.hpp"
13
14#include <iostream>
15#include <sstream>
16
17#include "Teuchos_Assert.hpp"
18#include "Teuchos_Parser.hpp"
19#include "Teuchos_vector.hpp"
20#include "Teuchos_string.hpp"
21#include "Teuchos_chartab.hpp"
22#include "Teuchos_Reader.hpp"
23#include "Teuchos_chartab.hpp"
24
25namespace Teuchos {
26namespace regex {
27
28Language make_language() {
29 /* The top produtions were from the "grep.y" YACC grammar in the source
30 code for Plan 9's grep utility, see here:
31https://github.com/wangeguo/plan9/blob/master/sys/src/cmd/grep/grep.y
32 The "set" related productions
33 are from a grammar intended to be used by ProLog to parse Perl's regular
34 expressions, see here:
35http://www.cs.sfu.ca/~cameron/Teaching/384/99-3/regexp-plg.html */
36 Language out;
37 Language::Productions& prods = out.productions;
38 prods.resize(NPRODS);
39 prods[PROD_REGEX]("regex") >> "union";
40 prods[PROD_UNION_DECAY]("union") >> "concat";
41 prods[PROD_UNION]("union") >> "union", "|", "concat"; // union
42 prods[PROD_CONCAT_DECAY]("concat") >> "qualified";
43 prods[PROD_CONCAT]("concat") >> "concat", "qualified"; // concatenation
44 prods[PROD_QUAL_DECAY]("qualified") >> "single";
45 prods[PROD_STAR]("qualified") >> "qualified", "*";
46 prods[PROD_PLUS]("qualified") >> "qualified", "+";
47 prods[PROD_MAYBE]("qualified") >> "qualified", "?";
48 prods[PROD_SINGLE_CHAR]("single") >> "char";
49 prods[PROD_ANY]("single") >> "."; // any
50 prods[PROD_SINGLE_SET]("single") >> "set";
51 prods[PROD_PARENS_UNION]("single") >> "(", "union", ")";
52 prods[PROD_SET_POSITIVE]("set") >> "positive-set";
53 prods[PROD_SET_NEGATIVE]("set") >> "negative-set";
54 prods[PROD_POSITIVE_SET]("positive-set") >> "[", "set-items", "]";
55 prods[PROD_NEGATIVE_SET]("negative-set") >> "[", "^", "set-items", "]";
56 prods[PROD_SET_ITEMS_DECAY]("set-items") >> "set-item";
57 prods[PROD_SET_ITEMS_ADD]("set-items") >> "set-items", "set-item";
58 prods[PROD_SET_ITEM_CHAR]("set-item") >> "char";
59 prods[PROD_SET_ITEM_RANGE]("set-item") >> "range";
60 prods[PROD_RANGE]("range") >> "char", "-", "char";
61 out.tokens.resize(NTOKS);
62 /* either one of the non-meta characters, or anything preceded by the escape slash */
63 out.tokens[TOK_CHAR]("char", "[^\\\\\\.\\[\\]\\(\\)\\|\\-\\^\\*\\+\\?]|\\\\.");
64 out.tokens[TOK_DOT](".", "\\.");
65 out.tokens[TOK_LRANGE]("[", "\\]");
66 out.tokens[TOK_RRANGE]("]", "\\]");
67 out.tokens[TOK_LPAREN]("(", "\\(");
68 out.tokens[TOK_RPAREN](")", "\\)");
69 out.tokens[TOK_UNION]("|", "\\|");
70 out.tokens[TOK_RANGE]("-", "\\-");
71 out.tokens[TOK_NEGATE]("^", "\\^");
72 out.tokens[TOK_STAR]("*", "\\*");
73 out.tokens[TOK_PLUS]("+", "\\+");
74 out.tokens[TOK_MAYBE]("?", "\\?");
75 return out;
76}
77
78/* bootstrap ! This lexer is used to build the ReaderTables that read
79 regular expressions themselves, so it can't depend on that reader ! */
80void make_lexer(FiniteAutomaton& result) {
81 std::string meta_chars_str = ".[]()|-^*+?";
82 std::set<int> all_chars;
83 for (int i = 0; i < NCHARS; ++i) all_chars.insert(i);
84 std::set<int> nonmeta_chars = all_chars;
85 for (int i = 0; i < Teuchos::size(meta_chars_str); ++i) {
86 int meta_char = at(meta_chars_str, i);
87 std::set<int>::iterator it = nonmeta_chars.find(get_symbol(meta_char));
88 nonmeta_chars.erase(it);
89 }
90 FiniteAutomaton lex_nonmeta;
91 make_set_nfa(lex_nonmeta, NCHARS, nonmeta_chars, TOK_CHAR);
92 FiniteAutomaton lex_slash;
93 make_char_single_nfa(lex_slash, '\\');
94 FiniteAutomaton lex_any;
95 make_set_nfa(lex_any, NCHARS, all_chars);
96 FiniteAutomaton lex_escaped;
97 concat(lex_escaped, lex_slash, lex_any, TOK_CHAR);
98 FiniteAutomaton lex_char;
99 unite(lex_char, lex_nonmeta, lex_escaped);
100 FiniteAutomaton lex_metachars;
101 for (int i = 0; i < Teuchos::size(meta_chars_str); ++i) {
102 int token = TOK_CHAR + i + 1;
103 if (i) {
104 FiniteAutomaton lex_metachar;
105 make_char_single_nfa(lex_metachar, at(meta_chars_str, i), token);
106 unite(lex_metachars, lex_metachars, lex_metachar);
107 } else {
108 make_char_single_nfa(lex_metachars, at(meta_chars_str, i), token);
109 }
110 }
111 unite(result, lex_metachars, lex_char);
112 make_deterministic(result, result);
113 simplify(result, result);
114}
115
116ReaderTablesPtr ask_reader_tables() {
117 static ReaderTablesPtr ptr;
118 if (ptr.strong_count() == 0) {
119 RCP<ReaderTables> newptr(new ReaderTables());
120 LanguagePtr lang = regex::ask_language();
121 GrammarPtr grammar = make_grammar(*lang);
122 newptr->parser = make_lalr1_parser(grammar);
123 regex::make_lexer(newptr->lexer);
124 newptr->indent_info.is_sensitive = false;
125 newptr->indent_info.indent_token = -1;
126 newptr->indent_info.dedent_token = -1;
127 ptr = newptr;
128 }
129 return ptr;
130}
131
132LanguagePtr ask_language() {
133 static LanguagePtr ptr;
134 if (ptr.strong_count() == 0) {
135 ptr.reset(new Language(make_language()));
136 }
137 return ptr;
138}
139
140void make_dfa(FiniteAutomaton& result, std::string const& name, std::string const& regex, int token) {
141 using std::swap;
142 regex::Reader reader(token);
143 any result_any;
144 try {
145 reader.read_string(result_any, regex, name);
146 } catch (const Teuchos::ParserFail& e) {
147 std::stringstream ss;
148 ss << e.what() << '\n';
149 ss << "error: couldn't build DFA for token \"" << name << "\" regex \"" << regex << "\"\n";
150 ss << "repeating with DebugReader:\n";
151 DebugReader debug_reader(regex::ask_reader_tables(), ss);
152 debug_reader.read_string(result_any, regex, name);
153 throw ParserFail(ss.str());
154 }
155 swap(any_ref_cast<FiniteAutomaton>(result_any), result);
156}
157
158regex::Reader::Reader(int result_token_in):
159 Teuchos::Reader(regex::ask_reader_tables()),
160 result_token(result_token_in) {
161}
162
163void regex::Reader::at_shift(any& result, int token, std::string& text) {
164 if (token != TOK_CHAR) return;
165 if (Teuchos::size(text) == 1) {
166 result = text[0];
167 } else if (Teuchos::size(text) == 2) {
168 TEUCHOS_ASSERT(text[0] == '\\');
169 result = text[1];
170 } else {
171 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
172 "BUG: regex char text is \"" << text << "\"\n");
173 }
174}
175
176void regex::Reader::at_reduce(any& result_any, int production, std::vector<any>& rhs) {
177 using std::swap;
178 switch (production) {
179 case PROD_REGEX: {
180 swap(result_any, at(rhs, 0));
181 FiniteAutomaton& result = any_ref_cast<FiniteAutomaton>(result_any);
182 make_deterministic(result, result);
183 simplify(result, result);
184 return;
185 }
186 case PROD_UNION_DECAY:
187 case PROD_CONCAT_DECAY:
188 case PROD_QUAL_DECAY:
189 case PROD_SET_ITEMS_DECAY:
190 case PROD_SET_ITEM_RANGE: {
191 swap(result_any, at(rhs, 0));
192 return;
193 }
194 case PROD_UNION: {
195 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
196 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
197 FiniteAutomaton& b = any_ref_cast<FiniteAutomaton>(at(rhs, 2));
198 unite(result, a, b);
199 return;
200 }
201 case PROD_CONCAT: {
202 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
203 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
204 FiniteAutomaton& b = any_ref_cast<FiniteAutomaton>(at(rhs, 1));
205 concat(result, a, b, result_token);
206 return;
207 }
208 case PROD_STAR: {
209 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
210 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
211 star(result, a, result_token);
212 return;
213 }
214 case PROD_PLUS: {
215 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
216 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
217 plus(result, a, result_token);
218 return;
219 }
220 case PROD_MAYBE: {
221 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
222 FiniteAutomaton& a = any_ref_cast<FiniteAutomaton>(at(rhs, 0));
223 maybe(result, a, result_token);
224 return;
225 }
226 case PROD_SINGLE_CHAR: {
227 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
228 char c = any_cast<char>(at(rhs, 0));
229 make_char_single_nfa(result, c, result_token);
230 return;
231 }
232 case PROD_ANY: {
233 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
234 make_range_nfa(result, NCHARS, 0, NCHARS - 1, result_token);
235 return;
236 }
237 case PROD_SINGLE_SET: {
238 FiniteAutomaton& result = make_any_ref<FiniteAutomaton>(result_any);
239 std::set<char>& charset = any_ref_cast<std::set<char> >(at(rhs, 0));
240 make_char_set_nfa(result, charset, result_token);
241 return;
242 }
243 case PROD_PARENS_UNION: {
244 swap(result_any, at(rhs, 1));
245 return;
246 }
247 case PROD_SET_POSITIVE: {
248 swap(result_any, at(rhs, 0));
249 return;
250 }
251 case PROD_SET_NEGATIVE: {
252 std::set<char>& result = make_any_ref<std::set<char> >(result_any);
253 std::set<char> const& charset = any_ref_cast<std::set<char> >(at(rhs, 0));
254 negate_set(result, charset);
255 return;
256 }
257 case PROD_POSITIVE_SET: {
258 swap(result_any, at(rhs, 1));
259 return;
260 }
261 case PROD_NEGATIVE_SET: {
262 swap(result_any, at(rhs, 2));
263 return;
264 }
265 case PROD_SET_ITEMS_ADD: {
266 std::set<char>& result = make_any_ref<std::set<char> >(result_any);
267 std::set<char>& a = any_ref_cast<std::set<char> >(at(rhs, 0));
268 std::set<char> const& b = any_ref_cast<std::set<char> >(at(rhs, 1));
269 swap(result, a);
270 unite_with(result, b);
271 return;
272 }
273 case PROD_SET_ITEM_CHAR: {
274 std::set<char>& result = make_any_ref<std::set<char> >(result_any);
275 char c = any_cast<char>(at(rhs, 0));
276 result.insert(c);
277 return;
278 }
279 case PROD_RANGE: {
280 std::set<char>& result = make_any_ref<std::set<char> >(result_any);
281 char a = any_cast<char>(at(rhs, 0));
282 char b = any_cast<char>(at(rhs, 2));
283 for (char c = a; c <= b; ++c) {
284 result.insert(c);
285 }
286 return;
287 }
288 }
289 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
290 "BUG: unexpected production " << production << '\n');
291}
292
293} // namespace regex
294} // namespace Teuchos
Declares Teuchos::Parser, ParserFail and make_lalr1_parser.
Declares Teuchos::Reader.
Tries to create LALR(1) parser tables for a given grammar.
void reset()
Reset to null.
#define TEUCHOS_ASSERT(assertion_test)
This macro is throws when an assert fails.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Macro for throwing an exception with breakpointing to ease debugging.
The Teuchos namespace contains all of the classes, structs and enums used by Teuchos,...
RCP< const ReaderTables > ReaderTablesPtr
an RCP to a const ReaderTables
void make_lexer(FiniteAutomaton &result, Language const &language)
construct a lexer for the Language tokens.
RCP< const Language > LanguagePtr
an RCP to a const Language
Parser make_lalr1_parser(GrammarPtr grammar, bool verbose)
Tries to create LALR(1) parser tables for a given grammar.
Productions productions
vector of productions