- 
                Notifications
    You must be signed in to change notification settings 
- Fork 602
toke.c: S_intuit_more: More effectively use S_scan_ident #23880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: blead
Are you sure you want to change the base?
Changes from all commits
3b86b9c
              b302a19
              eb6d467
              5eeec5e
              2aeccb9
              5d79145
              26848f0
              7eb08b7
              b61e5b8
              ac4d84e
              5e9f082
              9f31304
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -4590,10 +4590,7 @@ S_intuit_more(pTHX_ char *s, char *e, | |
| * written, and regcurly never required a comma, as in {0}. Probably it is | ||
| * ok as-is */ | ||
| if (s[0] == '{') { | ||
| if (regcurly(s, e, NULL)) { | ||
| return FALSE; | ||
| } | ||
| return TRUE; | ||
| return ! regcurly(s, e, NULL); | ||
| } | ||
|  | ||
| /* Here is '[': maybe we have a character class. Examine the guts */ | ||
|  | @@ -4735,72 +4732,147 @@ S_intuit_more(pTHX_ char *s, char *e, | |
| * looks for. | ||
| * | ||
| */ | ||
| if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) { | ||
| Size_t len; | ||
|  | ||
| /* khw: where did the magic number 4 come from?. This buffer | ||
| * was 4 times as large as tokenbuf in 1997, and had not | ||
| * changed since the code was first added */ | ||
| char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ]; | ||
| /* (Reserve tmpbuf[0] for future commits, hence +1) */ | ||
| char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) + 1 ]; | ||
| char * s_after_ident; | ||
|  | ||
| /* scan_ident returns NULL if the input looks like an identifier | ||
| * that is illegal, e.g., it is too long or is like $001. */ | ||
| s_after_ident = scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf), | ||
| CHECK_ONLY); | ||
| if (s_after_ident == NULL) { | ||
|  | ||
| /* An illegal identifier means this can't be a subscript; | ||
| * it's an error or it could be a charclass */ | ||
| return false; | ||
| } | ||
|  | ||
| Size_t len; /* (C++ forbids joining these 2 lines) */ | ||
| len = strlen(tmpbuf + 1); | ||
|  | ||
| /* If it doesn't look like an identifier at all, scan_ident will | ||
| * set tmpbuf[1] to NUL. This is either an error or a character | ||
| * class. */ | ||
| if (len == 0) { | ||
| return false; | ||
| } | ||
|  | ||
| /* If there is extra stuff in the source, like braces, it means | ||
| * this is almost definitely intended to be an identifier */ | ||
| bool decorated; | ||
| decorated = (Size_t) (s_after_ident - s) > len; | ||
|  | ||
| if (isDIGIT_A(tmpbuf[1])) { | ||
|  | ||
| /* &41 and &4b are illegal subroutine names so is an error or | ||
| * a charclass */ | ||
| if (s[0] == '&') { | ||
| return false; | ||
| } | ||
|  | ||
| if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY)) | ||
| /* Here, matches [$@]\d+. If the next input character is a | ||
| * \w, we would have something like $456x, which is an illegal | ||
| * identifer, so is an error or a charclass */ | ||
| if ( ! decorated | ||
| && isWORDCHAR_lazy_if_safe(s_after_ident, | ||
| PL_bufend, UTF)) | ||
| { | ||
| /* An illegal identifier means this can't be a subscript; | ||
| * it's an error or it could be a charclass */ | ||
| return false; | ||
| } | ||
|  | ||
| len = strlen(tmpbuf); | ||
| /* We don't get here if this potential identifier starts with | ||
| * leading zeros, due to the logic in scan_ident. */ | ||
| assert(len == 1 || tmpbuf[0] != '0'); | ||
|  | ||
| /* khw: This only looks at global variables; lexicals came | ||
| * later, and this hasn't been updated. Ouch!! */ | ||
| if ( len > 1 | ||
| && gv_fetchpvn_flags(tmpbuf, | ||
| len, | ||
| UTF ? SVf_UTF8 : 0, | ||
| SVt_PV)) | ||
| /* The chances are vanishingly small that someone is going to | ||
| * want [$0] to expand to the program's name in a character | ||
| * class. But, what would the program's name be doing as part | ||
| * of a subscript either? The only likely scenario is that | ||
| * this is meant to be a charclass matching either '$' or '0'. | ||
| * */ | ||
| if (tmpbuf[1] == '0') { | ||
| return false; | ||
| } | ||
|  | ||
| /* Here it is either something like $1 which is supposed to | ||
| * match either dollar or 1, or it is supposed to expand to | ||
| * what is in $1 left over from a capturing group from the | ||
| * previous pattern match. In the latter case, it could be | ||
| * either a part of wanting to calculate a subscript, or to | ||
| * use as the contents of as part of the character class. | ||
| * Larger (undecorated) numbers are much less likely to have | ||
| * had capturing groups, so they lean more towards a | ||
| * charclass. 100 is what this function has traditionally | ||
| * used for len>1; khw thinks there is no bias one way or the | ||
| * other for length 1 ones. But has chosen 100 for decorated | ||
| * identifiers | ||
| * | ||
| * XXX long enough identifiers could probably return false | ||
| * immediately here, rather than using weights. */ | ||
| if (decorated || len > 1) { | ||
| weight -= 100; | ||
| } | ||
| } | ||
| else if (len > 1) { | ||
| /* See if there is a known identifier of the given kind. For | ||
| * arrays, this might also be a reference to one of its | ||
| * elements. XXX Maybe the latter should require a following | ||
| * '[' */ | ||
| if ( is_existing_identifier(tmpbuf, len, s[0], UTF) | ||
| || ( s[0] == '$' | ||
| && is_existing_identifier(tmpbuf, len, '@', UTF))) | ||
| { | ||
| weight -= 100; | ||
|  | ||
| /* khw: Below we keep track of repeated characters; People | ||
| * rarely say qr/[aba]/, as the second a is pointless. | ||
| * (Some do it though as a mnemonic that is meaningful to | ||
| * them.) But generally, repeated characters make things | ||
| * more likely to be a charclass. But here, this an | ||
| * identifier so likely a subscript. Its spelling should | ||
| * be irrelevant to the repeated characters test. So, we | ||
| * should advance past it. Suppose it is a hash element, | ||
| * like $subscripts{$which}. We should advance past the | ||
| * braces and key */ | ||
| /* khw: Below we keep track of repeated characters; | ||
| * People rarely say qr/[aba]/, as the second a is | ||
| * pointless. (Some do it though as a mnemonic that is | ||
| * meaningful to them.) But generally, repeated characters | ||
| * make things more likely to be a charclass. But here, | ||
| * this an identifier so likely a subscript. Its spelling | ||
| * should be irrelevant to the repeated characters test. | ||
| * So, we should advance past it. Suppose it is a hash | ||
| * element, like $subscripts{$which}. We should advance | ||
| * past the braces and key */ | ||
| } | ||
| else { | ||
| /* Not a multi-char identifier already known in the | ||
| * program; is somewhat likely to be a subscript. | ||
| * | ||
| * khw: Our test suite contains several constructs like | ||
| * [$A-Z]. Excluding length 1 identifiers in the | ||
| * conditional above means such are much less likely to be | ||
| * mistaken for subscripts. I would argue that if the next | ||
| * character is a '-' followed by an alpha, that would make | ||
| * it much more likely to be a charclass. It would only | ||
| * make sense to be an expression if that alpha string is a | ||
| * bareword with meaning; something like [$A-ord] */ | ||
| else { /* Isn't a known identifier */ | ||
| /* Under strict, this means an error. */ | ||
| if (under_strict_vars) { | ||
| return false; | ||
| } | ||
|  | ||
| /* Otherwise still somewhat likely to be a subscript */ | ||
| weight -= 10; | ||
| } | ||
| } | ||
| else if ( s[0] == '$' | ||
| && s[1] | ||
| && memCHRs("[#!%*<>()-=", s[1])) | ||
| else if ( len == 1 | ||
| && s[0] == '$' | ||
| && memCHRs("[#!%*<>()-=", tmpbuf[1])) | ||
| { | ||
| /* Here we have what could be a punctuation variable. If the | ||
| * next character after it is a closing bracket, it makes it | ||
| * quite likely to be that, and hence a subscript. If it is | ||
| * something else, more mildly a subscript */ | ||
| if (/*{*/ memCHRs("])} =", s[2])) | ||
| if (/*{*/ memCHRs("])} =", tmpbuf[2])) | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. accidentally deleted previous comment :( little bit confused: this happens in branch when strlen of tmpbuf is already checked to be 1, so tmpbuf[2] should \0 | ||
| weight -= 10; | ||
| else | ||
| weight -= 1; | ||
| } | ||
| else { | ||
| /* Not a multi-char identifier already known in the program; | ||
| * is somewhat likely to be a subscript. | ||
| * | ||
| * khw: Our test suite contains several constructs like | ||
| * [$A-Z]. Excluding length 1 identifiers in the conditional | ||
| * above means such are much less likely to be mistaken for | ||
| * subscripts. I would argue that if the next character is a | ||
| * '-' followed by an alpha, that would make it much more | ||
| * likely to be a charclass. It would only make sense to be | ||
| * an expression if that alpha string is a bareword with | ||
| * meaning; something like [$A-ord] */ | ||
| weight -= 10; | ||
| } | ||
| break; | ||
|  | ||
| /* khw: [:blank:] strongly indicates a charclass */ | ||
|  | @@ -4812,66 +4884,64 @@ S_intuit_more(pTHX_ char *s, char *e, | |
| * \? must be subscript for things like \d, but not \a. | ||
| */ | ||
|  | ||
|  | ||
| case '\\': | ||
| if (s[1]) { | ||
| if (memCHRs("wds]", s[1])) { | ||
| weight += 100; /* \w \d \s => strongly charclass */ | ||
| /* khw: \] can't happen, as any ']' is beyond our search. | ||
| * Why not \W \D \S \h \v, etc as well? Should they have | ||
| * the same weights as \w \d \s or should all or some be | ||
| * in the 'abcfnrtvx' below? */ | ||
| } else if (seen[(U8)'\''] || seen[(U8)'"']) { | ||
| weight += 1; | ||
| /* khw: This is problematic. Enough so, that I misread | ||
| * it, and added a wrong comment about what it does in | ||
| * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it | ||
| * doesn't look at the current character. What it | ||
| * actually does is: if any quote has been seen in the | ||
| * parse, don't do the rest of the else's below, but for | ||
| * every subsequent backslashed character encountered | ||
| * (except \0 \w \s \d), increment the weight to lean a | ||
| * bit more towards being a charclass. That means that | ||
| * every backslash sequence following the first occurrence | ||
| * of a quote increments the weight regardless of what the | ||
| * sequence is. Again, \0 \w \d and \s are not controlled | ||
| * by this else, so they change the weight by a lot more. | ||
| * But what makes them so special that they aren't subject | ||
| * to this. Any why does having a quote change the | ||
| * behavior from then on. And why only backslashed | ||
| * sequences get this treatment? This code has been | ||
| * unchanged since this function was added in 1993. I | ||
| * don't get it. Instead, it does seem to me that it is | ||
| * especially unlikely to repeat a quote in a charclass, | ||
| * but that having just a single quote is indicative of a | ||
| * charclass, and having pairs of quotes is indicative of | ||
| * a subscript. Similarly for things that could indicate | ||
| * nesting of braces or parens. */ | ||
| } | ||
| else if (memCHRs("abcfnrtvx", s[1])) | ||
| weight += 40; /* \n, etc => charclass */ | ||
| /* khw: Why not \e etc as well? */ | ||
| else if (isDIGIT(s[1])) { | ||
| weight += 40; /* \123 => charclass */ | ||
| while (s[1] && isDIGIT(s[1])) | ||
| s++; | ||
| } | ||
|  | ||
| /* khw: There are lots more possible escape sequences. Some, | ||
| * like \A,\z have no special meaning to charclasses, so might | ||
| * indicate a subscript, but I don't know what they would be | ||
| * doing there either. Some have been added to the language | ||
| * after this code was written, but no one thought to, or | ||
| * could wade through this function, to add them. Things like | ||
| * \p{} for properties, \N and \N{}, for example. | ||
| * | ||
| * It's problematic that \a is treated as plain 'a' for | ||
| * purposes of the 'seen' array. Whatever is matched by these | ||
| * backslashed sequences should not be added to 'seen'. That | ||
| * includes the backslash. */ | ||
| } | ||
| else /* \ followed by NUL strongly indicates character class */ | ||
| if (s[1] == '\0') { | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Personally: great change, maybe even add  Curious question: common pattern is condition  | ||
| /* \ followed by NUL strongly indicates character class */ | ||
| weight += 100; | ||
| } | ||
| else if (memCHRs("wds]", s[1])) { | ||
| weight += 100; /* \w \d \s => strongly charclass */ | ||
| /* khw: \] can't happen, as any ']' is beyond our search. Why | ||
| * not \W \D \S \h \v, etc as well? Should they have the same | ||
| * weights as \w \d \s or should all or some be in the | ||
| * 'abcfnrtvx' below? */ | ||
| } | ||
| else if (seen[(U8)'\''] || seen[(U8)'"']) { | ||
| weight += 1; | ||
| /* khw: This is problematic. Enough so, that I misread it, | ||
| * and added a wrong comment about what it does in | ||
| * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it | ||
| * doesn't look at the current character. What it actually | ||
| * does is: if any quote has been seen in the parse, don't do | ||
| * the rest of the else's below, but for every subsequent | ||
| * backslashed character encountered (except \0 \w \s \d), | ||
| * increment the weight to lean a bit more towards being a | ||
| * charclass. That means that every backslash sequence | ||
| * following the first occurrence of a quote increments the | ||
| * weight regardless of what the sequence is. Again, \0 \w \d | ||
| * and \s are not controlled by this else, so they change the | ||
| * weight by a lot more. But what makes them so special that | ||
| * they aren't subject to this. Any why does having a quote | ||
| * change the behavior from then on. And why only backslashed | ||
| * sequences get this treatment? This code has been unchanged | ||
| * since this function was added in 1993. I don't get it. | ||
| * Instead, it does seem to me that it is especially unlikely | ||
| * to repeat a quote in a charclass, but that having just a | ||
| * single quote is indicative of a charclass, and having pairs | ||
| * of quotes is indicative of a subscript. Similarly for | ||
| * things that could indicate nesting of braces or parens. */ | ||
| } | ||
| else if (memCHRs("abcfnrtvx", s[1])) | ||
| weight += 40; /* \n, etc => charclass */ | ||
| /* khw: Why not \e etc as well? */ | ||
| else if (isDIGIT(s[1])) { | ||
| weight += 40; /* \123 => charclass */ | ||
| while (s[1] && isDIGIT(s[1])) | ||
| s++; | ||
| } | ||
|  | ||
| /* khw: There are lots more possible escape sequences. Some, like | ||
| * \A,\z have no special meaning to charclasses, so might indicate | ||
| * a subscript, but I don't know what they would be doing there | ||
| * either. Some have been added to the language after this code | ||
| * was written, but no one thought to, or could wade through this | ||
| * function, to add them. Things like \p{} for properties, \N and | ||
| * \N{}, for example. | ||
| * | ||
| * It's problematic that \a is treated as plain 'a' for purposes | ||
| * of the 'seen' array. Whatever is matched by these backslashed | ||
| * sequences should not be added to 'seen'. That includes the | ||
| * backslash. */ | ||
| break; | ||
|  | ||
| case '-': | ||
|  | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great change. Former pattern appears to be used in other places, marking such cleanup as my next TODO once my current PR is merged.