1 (***********************************************************************)
5 (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
7 (* Copyright 1996 Institut National de Recherche en Informatique et *)
8 (* en Automatique. All rights reserved. This file is distributed *)
9 (* under the terms of the Q Public License version 1.0. *)
11 (***********************************************************************)
13 (* $Id: lexer.mll 7307 2006-01-04 16:55:50Z doligez $ *)
15 (* The lexical analyzer for lexer definitions. Bootstrapped! *)
21 (* Auxiliaries for the lexical analyzer *)
23 let brace_depth = ref 0
24 and comment_depth = ref 0
26 let in_pattern () = !brace_depth = 0 && !comment_depth = 0
28 exception Lexical_error of string * string * int * int
30 let string_buff = Buffer.create 256
32 let reset_string_buffer () = Buffer.clear string_buff
34 let store_string_char c = Buffer.add_char string_buff c
36 let get_stored_string () = Buffer.contents string_buff
38 let char_for_backslash = function
45 let raise_lexical_error lexbuf msg =
46 let p = Lexing.lexeme_start_p lexbuf in
47 raise (Lexical_error (msg,
50 p.Lexing.pos_cnum - p.Lexing.pos_bol + 1))
53 let handle_lexical_error fn lexbuf =
54 let p = Lexing.lexeme_start_p lexbuf in
55 let line = p.Lexing.pos_lnum
56 and column = p.Lexing.pos_cnum - p.Lexing.pos_bol + 1
57 and file = p.Lexing.pos_fname
61 with Lexical_error (msg, "", 0, 0) ->
62 raise(Lexical_error(msg, file, line, column))
64 let get_input_name () = Sys.argv.(Array.length Sys.argv - 1)
66 let warning lexbuf msg =
67 let p = Lexing.lexeme_start_p lexbuf in
68 Printf.eprintf "ocamllex warning:\nFile \"%s\", line %d, character %d: %s.\n"
69 p.Lexing.pos_fname p.Lexing.pos_lnum
70 (p.Lexing.pos_cnum - p.Lexing.pos_bol + 1) msg;
73 let decimal_code c d u =
74 100 * (Char.code c - 48) + 10 * (Char.code d - 48) + (Char.code u - 48)
76 let char_for_hexadecimal_code d u =
77 let d1 = Char.code d in
78 let val1 = if d1 >= 97 then d1 - 87
79 else if d1 >= 65 then d1 - 55
82 let d2 = Char.code u in
83 let val2 = if d2 >= 97 then d2 - 87
84 else if d2 >= 65 then d2 - 55
87 Char.chr (val1 * 16 + val2)
89 let incr_loc lexbuf delta =
90 let pos = lexbuf.Lexing.lex_curr_p in
91 lexbuf.Lexing.lex_curr_p <- { pos with
92 Lexing.pos_lnum = pos.Lexing.pos_lnum + 1;
93 Lexing.pos_bol = pos.Lexing.pos_cnum - delta;
97 let update_loc lexbuf opt_file line =
98 let pos = lexbuf.Lexing.lex_curr_p in
99 let new_file = match opt_file with
100 | None -> pos.Lexing.pos_fname
103 lexbuf.Lexing.lex_curr_p <- { pos with
104 Lexing.pos_fname = new_file;
105 Lexing.pos_lnum = line;
106 Lexing.pos_bol = pos.Lexing.pos_cnum;
113 ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255']
115 ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
116 let backslash_escapes =
117 ['\\' '"' '\'' 'n' 't' 'b' 'r']
120 [' ' '\013' '\009' '\012' ] +
125 | "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']*
126 ('\"' ([^ '\010' '\013' '\"']* as name) '\"')?
127 [^ '\010' '\013']* '\010'
128 { update_loc lexbuf name (int_of_string num);
132 { comment_depth := 1;
133 handle_lexical_error comment lexbuf;
135 | '_' { Tunderscore }
136 | identstart identbody *
137 { match Lexing.lexeme lexbuf with
140 | "shortest" -> Tparse_shortest
147 { reset_string_buffer();
148 handle_lexical_error string lexbuf;
149 Tstring(get_stored_string()) }
150 (* note: ''' is a valid character literal (by contrast with the compiler) *)
152 { Tchar(Char.code(Lexing.lexeme_char lexbuf 1)) }
153 | "'" '\\' backslash_escapes "'"
154 { Tchar(Char.code(char_for_backslash (Lexing.lexeme_char lexbuf 2))) }
155 | "'" '\\' (['0'-'9'] as c) (['0'-'9'] as d) (['0'-'9'] as u)"'"
156 { let v = decimal_code c d u in
158 raise_lexical_error lexbuf
159 (Printf.sprintf "illegal escape sequence \\%c%c%c" c d u)
163 (['0'-'9' 'a'-'f' 'A'-'F'] as d) (['0'-'9' 'a'-'f' 'A'-'F'] as u) "'"
164 { Tchar(Char.code(char_for_hexadecimal_code d u)) }
166 { raise_lexical_error lexbuf
167 (Printf.sprintf "illegal escape sequence \\%c" c)
170 { let p = Lexing.lexeme_end_p lexbuf in
171 let n1 = p.Lexing.pos_cnum
172 and l1 = p.Lexing.pos_lnum
173 and s1 = p.Lexing.pos_bol in
175 let n2 = handle_lexical_error action lexbuf in
176 Taction({start_pos = n1; end_pos = n2;
177 start_line = l1; start_col = n1 - s1}) }
192 { raise_lexical_error lexbuf
193 ("illegal character " ^ String.escaped(Lexing.lexeme lexbuf))
197 (* String parsing comes from the compiler lexer *)
201 | '\\' ("\010" | "\013" | "\013\010") ([' ' '\009'] * as spaces)
202 { incr_loc lexbuf (String.length spaces);
204 | '\\' (backslash_escapes as c)
205 { store_string_char(char_for_backslash c);
207 | '\\' (['0'-'9'] as c) (['0'-'9'] as d) (['0'-'9'] as u)
208 { let v = decimal_code c d u in
209 if in_pattern () && v > 255 then
212 "illegal backslash escape in string: `\\%c%c%c'" c d u) ;
213 store_string_char (Char.chr v);
215 | '\\' 'x' (['0'-'9' 'a'-'f' 'A'-'F'] as d) (['0'-'9' 'a'-'f' 'A'-'F'] as u)
216 { store_string_char (char_for_hexadecimal_code d u) ;
219 {if in_pattern () then
221 (Printf.sprintf "illegal backslash escape in string: `\\%c'" c) ;
222 store_string_char '\\' ;
223 store_string_char c ;
226 { raise(Lexical_error("unterminated string", "", 0, 0)) }
228 { store_string_char '\010';
232 { store_string_char c;
236 Lexers comment and action are quite similar,
237 they should lex both strings and characters,
238 in order not to be confused by what is inside then
243 { incr comment_depth; comment lexbuf }
245 { decr comment_depth;
246 if !comment_depth = 0 then () else comment lexbuf }
248 { reset_string_buffer();
250 reset_string_buffer();
256 { raise(Lexical_error("unterminated comment", "", 0, 0)) }
269 if !brace_depth = 0 then Lexing.lexeme_start lexbuf else action lexbuf }
271 { reset_string_buffer();
272 handle_lexical_error string lexbuf;
273 reset_string_buffer();
279 { comment_depth := 1;
283 { raise (Lexical_error("unterminated action", "", 0, 0)) }
290 and skip_char = parse
294 | [^ '\\' '\''] "'" (* regular character *)
295 (* one character and numeric escape sequences *)
297 | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
298 | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] "'"