Hacking parse.y (RubyKansai38)
1. Hacking parse.y
Tatsuhiro UJIHISA
ujihisa@gmail.com
http://ujihisa.blogspot.com/
@ujm
4. Hacking parse.y
Fixing ruby parser to understand ruby
• Introducing new syntax
• {:key :-) "value"}
• 'symbol
• ++i
• def A#b(c)
• {1}
5. MRI Inside
• MRI (Matz Ruby Implementation)
• $ ruby -v
ruby 1.9.2dev (2009-08-05 trunk 24397) [i386-darwin9.7.0]
• Written in C
• array.c, vm.c, gc.c, etc...
6. ruby 1.8 vs 1.9
• ~1.8
• Parser: parse.y
• Evaluator: eval.c
• 1.9~
• Parser: parse.y
• Evaluator:YARV (vm*.c)
7. Matz said
• Ugly: eval.c and parse.y
RubyConf2006
• Now the original evaluator
was all replaced with YARV
8. MRI Parser
• MRI uses yacc
(parser generator for C)
• parse.y → parse.c
bison -d -o y.tab.c parse.y
sed -f ./tool/ytab.sed -e "/^#/s!y.tab.c!parse.c!" y.tab.c > parse.c.new
...
9. parse.y
• One of the darkest sides
• $ wc -l *{c,h,y} | sort -n
...
9261 io.c
10350 parse.y
16352 parse.c # (automatically generated)
183370 total
11. Tokens in Lexer
%token <id> tOP_ASGN /* +=, -= etc. */
%token tUPLUS /* unary+ */ %token tASSOC /* => */
%token tUMINUS /* unary- */ %token tLPAREN /* ( */
%token tPOW /* ** */ %token tLPAREN_ARG /* ( */
%token tCMP /* <=> */ %token tRPAREN /* ) */
%token tEQ /* == */ %token tLBRACK /* [ */
%token tEQQ /* === */ %token tLBRACE /* { */
%token tNEQ /* != */ %token tLBRACE_ARG /* { */
%token tGEQ /* >= */ %token tSTAR /* * */
%token tLEQ /* <= */ %token tAMPER /* & */
%token tANDOP tOROP /* && and || */ %token tLAMBDA /* -> */
%token tMATCH tNMATCH /* =~ and !~ */ %token tSYMBEG tSTRING_BEG tXSTRING_BEG tREGEXP_BEG
%token tDOT2 tDOT3 /* .. and ... */ tWORDS_BEG tQWORDS_BEG
%token tAREF tASET /* [] and []= */ %token tSTRING_DBEG tSTRING_DVAR tSTRING_END tLAMBEG
%token tLSHFT tRSHFT /* << and >> */
%token tCOLON2 /* :: */
%token tCOLON3 /* :: at EXPR_BEG */
12. (detour)
• MRI: parse.y (10350 lines)
• JRuby: src/org/jruby/parser/{DefaultRubyParser.y,
Ruby19Parser.y}
(1886, 2076 lines)
• Rubinius: lib/ruby_parser.y (1795 lines)
13. Case 1:
:-)
• Hash literal
{:key => 'value'}
{:key :-) 'value'}
• :-) is just an alias of =>
15. Colons in Ruby
• A::B, ::C
• :symbol, :"sy-m-bol"
• a ? b : c
• {a: b}
• when 1: something (in 1.8)
17. How does parser deal
with colon?
• :: → tCOLON2 or tCOLON3
• tCOLON2 Net::URI
• tCOLON3 ::Kernel
18. lex_state
enum lex_state_e {
EXPR_BEG, /* ignore newline, +/- is a sign. */
EXPR_END, /* newline significant, +/- is an operator. */
EXPR_ENDARG, /* ditto, and unbound braces. */
EXPR_ARG, /* newline significant, +/- is an operator. */
EXPR_CMDARG, /* newline significant, +/- is an operator. */
EXPR_MID, /* newline significant, +/- is an operator. */
EXPR_FNAME, /* ignore newline, no reserved words. */
EXPR_DOT, /* right after `.' or `::', no reserved words. */
EXPR_CLASS, /* immediate after `class', no here document. */
EXPR_VALUE /* alike EXPR_BEG but label is disallowed. */
};
19. case ':':
c = nextc();
if (c == ':') {
if (IS_BEG() ||
lex_state == EXPR_CLASS ||
(IS_ARG() && space_seen)) {
lex_state = EXPR_BEG;
return tCOLON3;
}
lex_state = EXPR_DOT;
return tCOLON2;
}
20. ...
if (lex_state == EXPR_END ||
lex_state == EXPR_ENDARG ||
(c != -1 && ISSPACE(c))) {
pushback(c);
lex_state = EXPR_BEG;
return ':';
}
switch (c) {
case '\'':
lex_strterm = NEW_STRTERM(str_ssym, c, 0);
break;
case '"':
lex_strterm = NEW_STRTERM(str_dsym, c, 0);
break;
default:
pushback(c);
break;
}
lex_state = EXPR_FNAME;
return tSYMBEG;
21. How does parser deal
with colon? (summary)
• :: → tCOLON2 or tCOLON3
• EXPR_END or followed by a space → ':' (else)
• otherwise → tSYMBEG
• :' → str_ssym
• :" → str_dsym
22. So,
• :-) → tASSOC
• :: → tCOLON2 or tCOLON3
• EXPR_END or followed by a space → ':' (else)
• otherwise → tSYMBEG
• :' → str_ssym
• :" → str_dsym
24. Case 2:
Lisp Like Symbol
• Symbol Literal
:vancouver
'vancouver
• Ad-hoc
p :a, :b
p 'a, 'b
29. Lexer
@@ -685,6 +685,7 @@ static void
token_info_pop(struct parser_params*, const
char *token);
%type <val> program reswords then do
dot_or_colon
%*/
%token tUPLUS /* unary+ */
+%token tINCR /* ++var */
%token tUMINUS /* unary- */
%token tPOW /* ** */
%token tCMP /* <=> */
(Actually there are more trivial fixes)
31. parser example
variable : tIDENTIFIER
| tIVAR
| tGVAR
| tCONSTANT
| tCVAR
| keyword_nil {ifndef_ripper($$ = keyword_nil);}
| keyword_self {ifndef_ripper($$ = keyword_self);}
| keyword_true {ifndef_ripper($$ = keyword_true);}
| keyword_false {ifndef_ripper($$ = keyword_false);}
| keyword__FILE__ {ifndef_ripper($$ = keyword__FILE__);}
| keyword__LINE__ {ifndef_ripper($$ = keyword__LINE__);}
| keyword__ENCODING__ {ifndef_ripper($$ = keyword__ENCODING__);}
;
32. lhs : variable
{
/*%%%*/
if (!($$ = assignable($1, 0))) $$ = NEW_BEGIN(0);
/*%
$$ = dispatch1(var_field, $1);
%*/
}
| primary_value '[' opt_call_args rbracket
{
/*%%%*/
$$ = aryset($1, $3);
/*%
$$ = dispatch2(aref_field, $1, escape_Qundef($3));
%*/
}
...
33. BNF (part)
program : compstmt arg : lhs '=' arg
| var_lhs tOP_ASGN arg
compstmt : stmts opt_terms | primary_value '[' aref_args ']' tOP_ASGN arg
stmts : none
| stmt | arg '?' arg ':' arg
| stmts terms stmt | primary
stmt : kALIAS fitem fitem primary : literal
| kALIAS tGVAR tGVAR | strings
| expr | tLPAREN_ARG expr ')'
| tLPAREN compstmt ')'
expr : kRETURN call_args
| kBREAK call_args
| kREDO
| kRETRY
| '!' command_call
| arg
34. Assign
stmt : ...
| mlhs '=' command_call
{
/*%%%*/
value_expr($3);
$1->nd_value = $3;
$$ = $1;
/*%
$$ = dispatch2(massign, $1, $3);
%*/
}
35. mlhs
mlhs: mlhs_basic | ...
mlhs_basic: mlhs_head | ...
mlhs_head: mlhs_item ',' | ...
mlhs_item: mlhs_node | ...
mlhs_node: variable {
$$ = assignable($1, 0); }
36. Method call
block_command : block_call
| block_call '.' operation2 command_args
{
/*%%%*/
$$ = NEW_CALL($1, $3, $4);
/*%
$$ = dispatch3(call, $1, ripper_id2sym('.'), $3);
$$ = method_arg($$, $4);
%*/
}
37. Mix!
var_ref: ...
| tINCR variable
{
/*%%%*/
$$ = assignable($2, 0);
$$->nd_value = NEW_CALL(gettable($$->nd_vid),
rb_intern("succ"), 0);
/*%
$$ = dispatch2(unary, ripper_intern("++@"), $2);
%*/
}
39. Case 4:
def A#b
• A#b
instance method b of class A
• A.b
class method b of class A
42. #
(in parser_yylex)
case '#': /* it's a comment */
/* no magic_comment in shebang line */
if (!parser_magic_comment(parser, lex_p, lex_pend - lex_p)) {
if (comment_at_top(parser)) {
set_file_encoding(parser, lex_p, lex_pend);
}
}
lex_p = lex_pend;
43. #
(in parser_yylex)
case '#': /* it's a comment */
c = nextc();
pushback(c);
if(lex_state == EXPR_END && ISALNUM(c)) return '#';
/* no magic_comment in shebang line */
if (!parser_magic_comment(parser, lex_p, lex_pend - lex_p)) {
if (comment_at_top(parser)) {
set_file_encoding(parser, lex_p, lex_pend);
44. Primary
primary: literal | ...
| k_def singleton dot_or_colon {lex_state = EXPR_FNAME;} fname
{
in_single++;
lex_state = EXPR_END; /* force for args */
/*%%%*/
local_push(0);
/*%
%*/
}
f_arglist
bodystmt
k_end
{
/*%%%*/
NODE *body = remove_begin($8);
reduce_nodes(&body);
$$ = NEW_DEFS($2, $5, $7, body);
fixpos($$, $2);
local_pop();
/*%
$$ = dispatch5(defs, $2, $3, $5, $7, $8);
%*/
in_single--;
}
45. | k_def cname '#' {lex_state = EXPR_FNAME;} fname
{
$<id>$ = cur_mid;
cur_mid = $5;
in_def++;
/*%%%*/
local_push(0);
/*%
%*/
}
f_arglist
bodystmt
k_end
{
/*%%%*/
NODE *body = remove_begin($8);
reduce_nodes(&body);
$$ = NEW_DEFN($5, $7, body, NOEX_PRIVATE);
fixpos($$, $7);
fixpos($$->nd_defn, $7);
$$ = NEW_CLASS(NEW_COLON3($2), $$, 0);
nd_set_line($$, $<num>6);
local_pop();
/*%
$$ = dispatch4(defi, $2, $5, $7, $8);
%*/
in_def--;
cur_mid = $<id>6;
}