22#define LOG_SURGEON_LEXER_HPP
33
44#include < array>
5- #include < bitset>
65#include < cstdint>
76#include < memory>
7+ #include < optional>
88#include < string>
99#include < unordered_map>
10- #include < unordered_set >
10+ #include < utility >
1111#include < vector>
1212
1313#include < log_surgeon/Constants.hpp>
1414#include < log_surgeon/finite_automata/Dfa.hpp>
1515#include < log_surgeon/finite_automata/DfaState.hpp>
16- #include < log_surgeon/finite_automata/Nfa.hpp>
1716#include < log_surgeon/finite_automata/RegexAST.hpp>
1817#include < log_surgeon/LexicalRule.hpp>
1918#include < log_surgeon/ParserInputBuffer.hpp>
2019#include < log_surgeon/Token.hpp>
20+ #include < log_surgeon/types.hpp>
2121
2222namespace log_surgeon {
2323template <typename TypedNfaState, typename TypedDfaState>
@@ -35,13 +35,11 @@ class Lexer {
3535
3636 /* *
3737 * Add lexical rule to the lexer's list of rules
38- * @param id
39- * @param regex
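38+ * @param rule_id ID to associate with the rule being added.
39+ * @param rule Regex AST of the rule; passed as a `std::unique_ptr` by value, so the caller gives up ownership.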
4040 */
41- auto add_rule (
42- uint32_t const & id,
43- std::unique_ptr<finite_automata::RegexAST<TypedNfaState>> rule
44- ) -> void;
41+ auto add_rule (rule_id_t rule_id, std::unique_ptr<finite_automata::RegexAST<TypedNfaState>> rule)
42+ -> void;
4543
4644 /* *
4745 * Return regex pattern for a rule name
@@ -51,7 +49,8 @@ class Lexer {
5149 auto get_rule (uint32_t variable_id) -> finite_automata::RegexAST<TypedNfaState>*;
5250
5351 /* *
54- * Generate DFA for lexer
52+ * Generate DFA for lexer.
53+ * @throw std::invalid_argument if `m_rules` contains multiple captures with the same name.
5554 */
5655 auto generate () -> void;
5756
@@ -122,8 +121,75 @@ class Lexer {
122121 return m_dfa;
123122 }
124123
125- std::unordered_map<std::string, uint32_t > m_symbol_id;
126- std::unordered_map<uint32_t , std::string> m_id_symbol;
124+ /* *
125+ * @param rule_id ID associated with a rule.
126+ * @return A vector of capture IDs corresponding to each rule that contain the variable on
127+ * success.
128+ * @return std::nullopt if the variable is never captured in any rule.
129+ */
130+ [[nodiscard]] auto get_capture_ids_from_rule_id (rule_id_t const rule_id
131+ ) const -> std::optional<std::vector<capture_id_t>> {
132+ if (m_rule_id_to_capture_ids.contains (rule_id)) {
133+ return m_rule_id_to_capture_ids.at (rule_id);
134+ }
135+ return std::nullopt ;
136+ }
137+
138+ /* *
139+ * @param capture_id ID associated with a capture within a rule.
140+ * @return The start and end tag of the capture on success.
141+ * @return std::nullopt if no capture is associated with the given capture ID.
142+ */
143+ [[nodiscard]] auto get_tag_id_pair_from_capture_id (capture_id_t const capture_id
144+ ) const -> std::optional<std::pair<tag_id_t, tag_id_t>> {
145+ if (m_capture_id_to_tag_id_pair.contains (capture_id)) {
146+ return m_capture_id_to_tag_id_pair.at (capture_id);
147+ }
148+ return std::nullopt ;
149+ }
150+
151+ /* *
152+ * @param tag_id ID associated with a tag.
153+ * @return The final register ID tracking the value of the tag ID during DFA simulation on
154+ * success.
155+ * @return std::nullopt if no tag is associated with the given tag ID.
156+ */
157+ [[nodiscard]] auto get_reg_id_from_tag_id (tag_id_t const tag_id
158+ ) const -> std::optional<reg_id_t> {
159+ if (m_tag_to_reg_id.contains (tag_id)) {
160+ return m_tag_to_reg_id.at (tag_id);
161+ }
162+ return std::nullopt ;
163+ }
164+
165+ /* *
166+ * @param capture_id ID associated with a capture within a rule.
167+ * @return The start and end final register IDs tracking the position of the capture on success.
168+ * @return std::nullopt if no capture is associated with the given capture ID.
169+ */
170+ [[nodiscard]] auto get_reg_ids_from_capture_id (capture_id_t const capture_id
171+ ) const -> std::optional<std::pair<reg_id_t, reg_id_t>> {
172+ auto const optional_tag_id_pair{get_tag_id_pair_from_capture_id (capture_id)};
173+ if (false == optional_tag_id_pair.has_value ()) {
174+ return std::nullopt ;
175+ }
176+ auto const [start_tag_id, end_tag_id]{optional_tag_id_pair.value ()};
177+
178+ auto const optional_start_reg_id{get_reg_id_from_tag_id (start_tag_id)};
179+ if (false == optional_start_reg_id.has_value ()) {
180+ return std::nullopt ;
181+ }
182+
183+ auto const optional_end_reg_id{get_reg_id_from_tag_id (end_tag_id)};
184+ if (false == optional_end_reg_id.has_value ()) {
185+ return std::nullopt ;
186+ }
187+
188+ return {optional_start_reg_id.value (), optional_end_reg_id.value ()};
189+ }
190+
// Public lookup tables keyed in opposite directions: string -> rule ID and rule ID -> string.
// NOTE(review): presumably the strings are rule/symbol names and the two maps are kept as
// inverses of each other — confirm against the code that populates them.
191+ std::unordered_map<std::string, rule_id_t > m_symbol_id;
192+ std::unordered_map<rule_id_t , std::string> m_id_symbol;
127193
128194private:
129195 /* *
@@ -148,6 +214,9 @@ class Lexer {
// Owning pointer to the lexer's DFA.
148214 std::unique_ptr<finite_automata::Dfa<TypedDfaState>> m_dfa;
// Starts out false. NOTE(review): presumably set while scanning when the input buffer runs
// out — confirm against the scanning code.
149215 bool m_asked_for_more_data{false };
// Starts out null. NOTE(review): presumably the DFA state reached before the most recent
// interruption of scanning — confirm against the scanning code.
150216 TypedDfaState const * m_prev_state{nullptr };
// Capture IDs recorded per rule ID; read by `get_capture_ids_from_rule_id`.
217+ std::unordered_map<rule_id_t , std::vector<capture_id_t >> m_rule_id_to_capture_ids;
// (start tag ID, end tag ID) pair recorded per capture ID; read by
// `get_tag_id_pair_from_capture_id`.
218+ std::unordered_map<capture_id_t , std::pair<tag_id_t , tag_id_t >> m_capture_id_to_tag_id_pair;
// Register ID recorded per tag ID; read by `get_reg_id_from_tag_id`.
219+ std::unordered_map<tag_id_t , reg_id_t > m_tag_to_reg_id;
151220};
152221
153222namespace lexers {
0 commit comments