this is a patch for a modification to the modure.c file that implements sub and other convenient features (I hope).
Code: Select all
diff --git "a/C:\\Users\\Damiano\\AppData\\Local\\Temp\\TortoiseGit\\modure-74ab341.000.c" "b/C:\\msys64_micropython\\home\\dma\\projects\\micropython\\extmod\\modure.c"
index 31c2b98..544262e 100644
--- "a/C:\\Users\\Damiano\\AppData\\Local\\Temp\\TortoiseGit\\modure-74ab341.000.c"
+++ "b/C:\\msys64_micropython\\home\\dma\\projects\\micropython\\extmod\\modure.c"
@@ -40,6 +40,7 @@
#include "re1.5/re1.5.h"
#define FLAG_DEBUG 0x1000
+#define FLAG_IGNORECASE 0x0100
typedef struct _mp_obj_re_t {
mp_obj_base_t base;
@@ -77,8 +78,93 @@ STATIC mp_obj_t match_group(mp_obj_t self_in, mp_obj_t no_in) {
}
MP_DEFINE_CONST_FUN_OBJ_2(match_group_obj, match_group);
+#if MICROPY_PY_URE_MATCH_GROUPS
+
+STATIC mp_obj_t match_groups(mp_obj_t self_in) {
+ mp_obj_match_t *self = MP_OBJ_TO_PTR(self_in);
+ size_t no_items = (self->num_matches > 1? self->num_matches-1: 0);
+ if (no_items) {
+ mp_obj_tuple_t *groups = mp_obj_new_tuple(no_items, NULL);
+ for(size_t no = 1; no < self->num_matches; no++) {
+ const char *start = self->caps[no * 2];
+ if (start == NULL) {
+ // no match for this group
+ groups->items[no-1] = mp_const_none;
+ } else {
+ groups->items[no-1] = mp_obj_new_str_of_type(mp_obj_get_type(self->str),
+ (const byte*)start, self->caps[no * 2 + 1] - start);
+ }
+ }
+ return groups;
+ } else {
+ return mp_const_none;
+ }
+}
+MP_DEFINE_CONST_FUN_OBJ_1(match_groups_obj, match_groups);
+
+#endif
+
+#if MICROPY_PY_URE_MATCH_SPAN_START_END
+
+STATIC mp_obj_t match_span(size_t n_args, const mp_obj_t *args) {
+ mp_obj_match_t *self = MP_OBJ_TO_PTR(args[0]);
+ mp_int_t no = (n_args == 2 ? mp_obj_get_int(args[1]) : 0);
+ if (no < 0 || no >= self->num_matches) {
+ nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, mp_obj_new_int(no)));
+ }
+
+ const char *start = self->caps[no * 2];
+ if (start == NULL) {
+ // no match for this group
+ mp_obj_t tuple[2] = { mp_obj_new_int(-1), mp_obj_new_int(-1) };
+ return mp_obj_new_tuple(2, tuple);
+ }
+
+ Subject subj;
+ size_t len;
+ subj.begin = mp_obj_str_get_data(self->str, &len);
+ subj.end = subj.begin + len;
+
+ mp_obj_t tuple[2] = {
+ mp_obj_new_int(start - subj.begin),
+ mp_obj_new_int(self->caps[no * 2 + 1] - subj.begin)
+ };
+ return mp_obj_new_tuple(2, tuple);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_span_obj, 1, 2, match_span);
+
+STATIC mp_obj_t match_end(size_t n_args, const mp_obj_t *args) {
+ mp_obj_t bases_tuple = match_span(n_args, args);
+ size_t len;
+ mp_obj_t *items;
+ mp_obj_tuple_get(bases_tuple, &len, &items);
+ return items[1];
+}
+
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_end_obj, 1, 2, match_end);
+
+STATIC mp_obj_t match_start(size_t n_args, const mp_obj_t *args) {
+ mp_obj_t bases_tuple = match_span(n_args, args);
+ size_t len;
+ mp_obj_t *items;
+ mp_obj_tuple_get(bases_tuple, &len, &items);
+ return items[0];
+}
+
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_start_obj, 1, 2, match_start);
+
+#endif
+
STATIC const mp_rom_map_elem_t match_locals_dict_table[] = {
{ MP_ROM_QSTR(MP_QSTR_group), MP_ROM_PTR(&match_group_obj) },
+#if MICROPY_PY_URE_MATCH_GROUPS
+ { MP_ROM_QSTR(MP_QSTR_groups), MP_ROM_PTR(&match_groups_obj) },
+ #endif
+#if MICROPY_PY_URE_MATCH_SPAN_START_END
+ { MP_ROM_QSTR(MP_QSTR_span), MP_ROM_PTR(&match_span_obj) },
+ { MP_ROM_QSTR(MP_QSTR_start), MP_ROM_PTR(&match_start_obj) },
+ { MP_ROM_QSTR(MP_QSTR_end), MP_ROM_PTR(&match_end_obj) },
+#endif
};
STATIC MP_DEFINE_CONST_DICT(match_locals_dict, match_locals_dict_table);
@@ -174,10 +260,213 @@ STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_split_obj, 2, 3, re_split);
+#if MICROPY_PY_URE_SUB
+// str_to_int ... from objstr.c
+STATIC int str_to_int(const char *str, int *num) {
+ const char *s = str;
+ if ('0' <= *s && *s <= '9') {
+ *num = 0;
+ do {
+ *num = *num * 10 + (*s - '0');
+ s++;
+ } while ('0' <= *s && *s <= '9');
+ }
+ return s - str;
+}
+
+STATIC mp_obj_t ure_exec_sub(mp_obj_re_t *self, bool is_anchored, mp_obj_t replace, mp_obj_t where, mp_int_t count, mp_int_t flags) {
+ mp_buffer_info_t bufinfo;
+ mp_get_buffer_raise(where, &bufinfo, MP_BUFFER_READ);
+ bool debug = (flags & FLAG_DEBUG);
+ (void)debug;
+ bool ignorecase = (flags & FLAG_IGNORECASE);
+ (void)ignorecase;
+ Subject subj;
+ subj.begin = bufinfo.buf;
+ subj.end = subj.begin + bufinfo.len;
+ int caps_num = (self->re.sub + 1) * 2;
+ int pre_m_l = -1, post_m_l = -1;
+ int num_sub = 0;
+ const char* where_str = subj.begin;
+
+ vstr_t *vstr_return = NULL;
+ if ((vstr_return = vstr_new(0)) != NULL) {
+ while (true) {
+
+ mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, char*, caps_num);
+ // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
+ memset((char*)match->caps, 0, caps_num * sizeof(char*));
+ int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, is_anchored);
+
+ // if we didn't have a match, or had an empty match, it's time to stop
+ if (!res || match->caps[0] == match->caps[1]) {
+ break;
+ }
+ else {
+
+ // process match
+ match->base.type = &match_type;
+ match->num_matches = caps_num / 2; // caps_num counts start and end pointers
+ match->str = where;
+
+ const char* replace_str = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, match) : replace));
+
+ // add pre-match string
+ pre_m_l = (match->caps[0] - subj.begin);
+ if (pre_m_l != -1) {
+ vstr_add_strn(vstr_return, subj.begin, pre_m_l);
+ }
+
+ vstr_t *vstr_repl = NULL;
+ if ((vstr_repl = vstr_new(0)) != NULL) {
+ vstr_add_str(vstr_repl, replace_str);
+ const char *repl_p = vstr_null_terminated_str(vstr_repl);
+ do {
+ const char *group = NULL, *start_group = NULL, *end_group = NULL;
+ int match_no = -1, is_group_number = -1;
+ if (*repl_p == '\\') {
+ start_group = repl_p;
+ group = ++repl_p;
+
+ if (*group != 0 && *group == 'g') {
+ // search group with syntax "\g<number>"
+ const char *left_angle_bracket = ++group;
+ if (left_angle_bracket != 0 && *left_angle_bracket == '<') {
+ const char *value = ++left_angle_bracket;
+ if (value != 0) {
+
+ if (unichar_isalpha(*value)) {
+ mp_raise_NotImplementedError("group with syntax \"\\g<name>\"");
+ }
+
+ int value_l = str_to_int(value, &match_no);
+ if (match_no == -1) {
+ nlr_raise(mp_obj_new_exception_msg(&mp_type_RuntimeError, "missing group number"));
+ }
+
+ const char *right_angle_bracket = value += value_l;
+ if (right_angle_bracket != 0 && *right_angle_bracket == '>' && match_no < match->num_matches) {
+ is_group_number = 1;
+ end_group = value + 1;
+ }
+ }
+ }
+
+ }
+ else if (group != 0 && unichar_isdigit(*group)) {
+ // search group with syntax "\number"
+ const char *value = group;
+ int value_l = str_to_int(value, &match_no);
+ if (match_no > -1 && match_no < match->num_matches) {
+ is_group_number = 0;
+ end_group = value += value_l;
+ }
+ }
+
+ if (match_no > -1 && is_group_number > -1 && start_group != NULL && end_group != NULL) {
+
+ const char *start_match = match->caps[match_no * 2];
+ if (start_match == NULL) {
+ // no match for this group
+ return where;
+ }
+
+ size_t mg_l = (match->caps[match_no * 2 + 1] - start_match);
+ size_t gv_l = (end_group - start_group);
+
+ if (gv_l < mg_l) {
+ vstr_add_len(vstr_repl, (mg_l - gv_l));
+ repl_p += gv_l;
+ }
+ else if (gv_l > mg_l) {
+ vstr_cut_out_bytes(vstr_repl, (start_group - replace_str), (gv_l - mg_l));
+ repl_p -= mg_l;
+ }
+
+ // replace the substring matched by group
+ memmove((char *)(start_group + mg_l), (start_group + gv_l), strlen(start_group));
+ memcpy((char *)(start_group), start_match, mg_l);
+
+ }
+ }
+
+ } while (*(++repl_p) != 0);
+
+ // add replace
+ vstr_add_str(vstr_return, vstr_str(vstr_repl));
+
+ // post-match string
+ post_m_l = (match->caps[1] - where_str);
+
+ vstr_free(vstr_repl);
+
+ num_sub++;
+ }
+
+ }
+
+ subj.begin = match->caps[1];
+ subj.end = subj.begin + strlen(match->caps[1]);
+
+ if (count > 0 && --count == 0) {
+ break;
+ }
+
+ m_del_var(mp_obj_match_t, char*, match->num_matches, match);
+
+ }
+
+ }
+
+ // add post-match string
+ if (post_m_l != -1) {
+ vstr_add_str(vstr_return, &where_str[post_m_l]);
+ }
+
+ const mp_obj_type_t *str_type = mp_obj_get_type(where);
+ return (!num_sub ? where : mp_obj_new_str_from_vstr(str_type, vstr_return));
+}
+
+STATIC mp_obj_t re_sub_helper(bool is_anchored, mp_obj_re_t *self, size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) {
+ mp_obj_t replace = args[1];
+ mp_obj_t where = args[2];
+ mp_int_t count = 0;
+ mp_int_t flags = 0;
+ if (n_args > 3 && MP_OBJ_IS_INT(args[3])) {
+ count = mp_obj_get_int(args[3]);
+ } else {
+ mp_map_elem_t *count_elem = mp_map_lookup(kwargs, MP_OBJ_NEW_QSTR(MP_QSTR_count), MP_MAP_LOOKUP);
+ if (count_elem != NULL && MP_OBJ_IS_INT(count_elem->value)) {
+ count = mp_obj_get_int(count_elem->value);
+ }
+ }
+ if (n_args > 4 && MP_OBJ_IS_INT(args[4])) {
+ flags = mp_obj_get_int(args[4]);
+ } else {
+ mp_map_elem_t *flags_elem = mp_map_lookup(kwargs, MP_OBJ_NEW_QSTR(MP_QSTR_flags), MP_MAP_LOOKUP);
+ if (flags_elem != NULL && MP_OBJ_IS_INT(flags_elem->value)) {
+ flags = mp_obj_get_int(flags_elem->value);
+ }
+ }
+ return ure_exec_sub(self, is_anchored, replace, where, count, flags);
+}
+
+STATIC mp_obj_t re_sub(size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) {
+ mp_obj_re_t *self = MP_OBJ_TO_PTR(args[0]);
+ return re_sub_helper(false, self, n_args, args, kwargs);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(re_sub_n_obj, 3, re_sub);
+#endif
+
+
STATIC const mp_rom_map_elem_t re_locals_dict_table[] = {
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) },
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) },
{ MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&re_split_obj) },
+#if MICROPY_PY_URE_SUB
+ { MP_OBJ_NEW_QSTR(MP_QSTR_sub), MP_ROM_PTR(&re_sub_n_obj) },
+#endif
};
STATIC MP_DEFINE_CONST_DICT(re_locals_dict, re_locals_dict_table);
@@ -232,11 +521,24 @@ STATIC mp_obj_t mod_re_search(size_t n_args, const mp_obj_t *args) {
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_search_obj, 2, 4, mod_re_search);
+#if MICROPY_PY_URE_SUB
+STATIC mp_obj_t mod_re_sub(size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) {
+ mp_obj_re_t *self = mod_re_compile(1, args);
+ return re_sub_helper(false, self, n_args, args, kwargs);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(mod_re_sub_obj, 3, mod_re_sub);
+#endif
+
STATIC const mp_rom_map_elem_t mp_module_re_globals_table[] = {
{ MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ure) },
{ MP_ROM_QSTR(MP_QSTR_compile), MP_ROM_PTR(&mod_re_compile_obj) },
{ MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&mod_re_match_obj) },
{ MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&mod_re_search_obj) },
+#if MICROPY_PY_URE_SUB
+ { MP_OBJ_NEW_QSTR(MP_QSTR_sub), MP_ROM_PTR(&mod_re_sub_obj) },
+ { MP_ROM_QSTR(MP_QSTR_IGNORECASE), MP_ROM_INT(FLAG_IGNORECASE) },
+#endif
{ MP_ROM_QSTR(MP_QSTR_DEBUG), MP_ROM_INT(FLAG_DEBUG) },
};