Previous Up Next

2.3  Module Analyze implementation

2.3.1  Type definitions


open Stdpp
open Token
open Grammar


(* A folder header: its title, its <DD> description text, and the later
   of its ADD_DATE / LAST_MODIFIED timestamps (0. when neither is given;
   see the [header] grammar rule below). *)
type header = {hd_contenu : string;
               hd_description : string;
               hd_add : float}
(* One bookmark link (<A HREF=...>): URL, link text, <DD> description,
   optional ICON value ("" when absent) and timestamp as for [header]. *)
type href = {href_url : string;
             href_contenu : string;
             href_description : string;
             href_icon : string;
             href_add : float}
(* A folder: its header plus its (possibly nested) entries. *)
type liste = {l_header : header; l_refs : ligne list}
(* One entry of a folder: a link, a sub-folder, an <HR> separator, or a
   marker for entries removed by the keyword filter further below. *)
and ligne = Href of href | Liste of liste | Separator | Filtered_out
(* A whole bookmark document. *)
type htmldoc = {meta: string list; titre : string; bmk : ligne list; footer : string}



2.3.2  Lexer

It is largely inspired from the file plexer.ml of the camlp4 distribution.


(* The string buffering machinery.
   A single growable buffer shared by the lexer: [store]/[mstore] append
   characters and return the new logical length, [get_buff] extracts the
   first [len] characters as a string.
   Rewritten on [Bytes]: the original used [String.create] and in-place
   string mutation (`!buff.[len] <- x`), which the safe-string stdlib no
   longer permits.  The external interfaces are unchanged. *)
let buff = ref (Bytes.create 80)

(* [store len x] writes [x] at position [len] (growing the buffer when
   full) and returns the new length [len + 1]. *)
let store len x =
  if len >= Bytes.length !buff then begin
    (* Double the capacity so repeated stores stay amortized O(1),
       matching the original's doubling-by-concatenation. *)
    let bigger = Bytes.create (2 * Bytes.length !buff) in
    Bytes.blit !buff 0 bigger 0 (Bytes.length !buff);
    buff := bigger
  end;
  Bytes.set !buff len x;
  succ len

(* [mstore len s] appends the whole string [s] starting at position
   [len] and returns the new length. *)
let mstore len s =
  let rec add_rec len i =
    if i = String.length s then len else add_rec (store len s.[i]) (succ i)
  in
  add_rec len 0

(* [get_buff len] returns the first [len] buffered characters. *)
let get_buff len = Bytes.sub_string !buff 0 len

(* The lexer *)
let valch x = Char.code x - Char.code '0'

(* [ident len s] buffers the remaining characters of a word starting at
   buffer position [len] and returns the final length.  The accepted
   character class excludes whitespace, '"', '<' and '>'; a lone '>' is
   stored and terminates the word.  Camlp4 stream-parser syntax
   ([parser], [< ... >]) — requires the camlp4 preprocessor. *)
let rec ident len =
  parser
      [< '  ('!' | '#'..';' | '=' | '?'..'\255' as c); s >] ->
        ident (store len c) s
    | [< '  '>' >] -> store len '>'
    | [< >] -> len

(* [next_token_fun find_id_kwd find_spe_kwd] builds the lexing function:
   it reads one token from a character stream and returns it together
   with its (begin, end) character positions.  Word-shaped runs are
   looked up with [find_id_kwd] (falling back to LIDENT); tokens opened
   by '<' are looked up with [find_spe_kwd]; "<!...>" comments and
   whitespace are skipped; double-quoted strings become STRING tokens.
   Camlp4 stream-parser syntax — requires the camlp4 preprocessor.
   Fix: "string not terminted" typo in the error message. *)
let next_token_fun find_id_kwd find_spe_kwd =
  (* Raise a located lexer error spanning characters [bp..ep]. *)
  let err bp ep msg = raise_with_loc (make_loc (bp, ep)) (Token.Error msg) in
  (* Read the rest of a '<'-introduced word and look it up as a special
     keyword; unknown words are an error. *)
  let keyword_or_error (bp, ep) c s =
    let spe = get_buff (ident (store 0 c) s) in
    begin try ("", (find_spe_kwd spe)) with
        Not_found -> err bp ep ("illegal token: " ^ spe)
    end
  in
  let rec next_token =
    (* One token: a keyword / LIDENT word, a lone '>', or a string. *)
    parser bp
        [< '  ('!' | '#'..';' | '=' | '?'..'\255' as c); s >] ->
          let id = get_buff (ident (store 0 c) s) in
            begin try ("", (find_id_kwd id)) with
                 Not_found -> ("LIDENT", id)
            end
      | [< '  '>' >] ->
          begin try ("", (find_id_kwd ">")) with
              Not_found -> ("LIDENT", ">")
          end
      | [< '  '"'; s >] -> ("STRING", (string bp 0 s))
  and string bp len =
    (* Body of a double-quoted string; [bp] locates errors. *)
    parser
        [< '  '"' >] -> get_buff len
      | [< 'c; s >] -> string bp (store len c) s
      | [< >] ep -> err bp ep "string not terminated"
  in
  let rec next_token_loc =
    (* Skip whitespace, route '<' through comment detection, attach
       locations, and emit EOI at end of stream. *)
    parser bp
        [< '  ' ' | '\n' | '\r' | '\t' | '\026' | '\012'; s >] ->
          next_token_loc s
      | [< '  '<'; s >] -> maybe_comment bp s
      | [< tok = next_token >] ep -> (tok, (bp, ep))
      | [< _ = Stream.empty >] -> (("EOI", ""), (bp, succ bp))
  and maybe_comment bp =
    (* After '<': "<!" opens a comment (skipped, then retry), anything
       else is a '<'-introduced keyword. *)
    parser
        [< '  '!'; s >] -> comment bp s; next_token_loc s
      | [< s >] ep -> let tok = keyword_or_error (bp, ep) '<' s in
          (tok, (bp, ep))
  and comment bp =
    (* Skip to the closing '>' of a "<!...>" comment. *)
    parser
        [< '  '>' >] -> ()
      | [< 'c; s >] -> comment bp s
      | [< >] ep -> err bp ep "comment not terminated"
  in
    fun cstrm ->
      try next_token_loc cstrm with
          Stream.Error str ->
            err (Stream.count cstrm) ((Stream.count cstrm)+1) str

let locerr () = invalid_arg "Lexer: location function"
let loct_create () = ref (Array.create 1024 None)
(* [loct_func loct i] returns the source location recorded for token
   number [i].  Out-of-range indices or never-filled slots call
   [locerr] (raises Invalid_argument). *)
let loct_func loct i =
  match
    if i < 0 || i >= Array.length !loct then None
    else Array.unsafe_get !loct i  (* bounds already checked above *)
  with
    Some loc -> make_loc loc
  | _ -> locerr ()

(* [loct_add loct i loc] records [loc] as the location of token number
   [i], growing the table as needed.  [Array.create] replaced by
   [Array.make]. *)
let loct_add loct i loc =
  if i >= Array.length !loct then
    begin
      (* Grow to max(double, i+1): the original doubled exactly once,
         which was still out of bounds whenever [i] jumped ahead by
         more than the current size. *)
      let new_tmax = max (Array.length !loct * 2) (i + 1) in
      let new_loct = Array.make new_tmax None in
      Array.blit !loct 0 new_loct 0 (Array.length !loct);
      loct := new_loct
    end;
  !loct.(i) <- Some loc

(* Lexer entry point for [Grammar.create]: given a character stream,
   return the token stream plus the function mapping token indices back
   to source locations.  The same keyword lookup serves both the
   identifier and the special ('<'-introduced) keyword tables. *)
let func kwd_table =
  let find = Hashtbl.find kwd_table in
  fun cstrm ->
    let next_token_loc = next_token_fun find find in
    let loct = loct_create () in
    let token_stream =
      Stream.from
        (fun i ->
           let (tok, loc) = next_token_loc cstrm in
           loct_add loct i loc;
           Some tok)
    in
    (token_stream, loct_func loct)

let check_keyword kwd = true

(* Called by the grammar machinery for every token pattern it uses:
   register ""-category patterns as keywords (first registration wins),
   accept the known token constructors, reject anything else. *)
let using_token kwd_table (p_con, p_prm) =
  match p_con with
  | "" ->
      if not (Hashtbl.mem kwd_table p_prm) then
        if check_keyword p_prm then Hashtbl.add kwd_table p_prm p_prm
        else
          raise
            (Token.Error
               ("the token \"" ^ p_prm ^
                  "\" does not respect Plexer rules"))
  | "LIDENT" | "UIDENT" | "INT" | "FLOAT" | "CHAR" | "STRING" | "QUOTATION"
  | "ANTIQUOT" | "LOCATE" | "EOI" -> ()
  | _ ->
      raise
        (Token.Error
           ("the constructor \"" ^ p_con ^ "\" is not recognized by Plexer"))


(* Unregister a keyword; token-constructor patterns need no cleanup. *)
let removing_token kwd_table (p_con, p_prm) =
  match p_con with
  | "" -> Hashtbl.remove kwd_table p_prm
  | _ -> ()


(* Human-readable rendering of a token pattern, used by the grammar in
   its error messages.  An empty value part names the token class; a
   non-empty one quotes the expected literal. *)
let text (con, prm) =
  match (con, prm) with
  | ("", t) -> "'" ^ t ^ "'"
  | ("LIDENT", "") -> "lowercase identifier"
  | ("LIDENT", t) -> "'" ^ t ^ "'"
  | ("UIDENT", "") -> "uppercase identifier"
  | ("UIDENT", t) -> "'" ^ t ^ "'"
  | ("INT", "") -> "integer"
  | ("INT", s) -> "'" ^ s ^ "'"
  | ("FLOAT", "") -> "float"
  | ("STRING", "") -> "string"
  | ("CHAR", "") -> "char"
  | ("QUOTATION", "") -> "quotation"
  | ("ANTIQUOT", k) -> "antiquot \"" ^ k ^ "\""
  | ("LOCATE", "") -> "locate"
  | ("EOI", "") -> "end of input"
  | (c, "") -> c
  | (c, p) -> c ^ " \"" ^ p ^ "\""


(* [eq_before_colon p e] checks that [e] is exactly [p] immediately
   followed by ':' (with anything after).  Used to match ANTIQUOT
   payloads of the form "name:rest" against the pattern name.
   @raise Failure when [e] is exhausted before the comparison decides
   (malformed ANTIQUOT payload).
   Fix: replaced physical equality [==] by structural [=] — identical
   on chars and ints, but [=] is the correct idiom. *)
let eq_before_colon p e =
  let rec loop i =
    if i = String.length e then
      failwith "Internal error in Plexer: incorrect ANTIQUOT"
    else if i = String.length p then e.[i] = ':'
    else if p.[i] = e.[i] then loop (i + 1)
    else false
  in
  loop 0


(* [after_colon e] returns everything after the first ':' in [e], or
   "" when [e] contains no colon. *)
let after_colon e =
  match String.index e ':' with
  | exception Not_found -> ""
  | i -> String.sub e (i + 1) (String.length e - i - 1)


(* Token-matching function in the style of camlp4's Plexer: ANTIQUOT
   patterns match by the name before ':' (returning the payload after
   it); other patterns match by constructor, and by literal value too
   when one is given.
   NOTE(review): this definition is dead code — it is immediately
   shadowed by [let tparse _ = None] just below, which is what the
   [lexer] record actually uses.  Kept for reference. *)
let tparse =
  function
    "ANTIQUOT", p_prm ->
      (fun (strm__ : _ Stream.t) ->
         match Stream.peek strm__ with
           Some ("ANTIQUOT", prm) when eq_before_colon p_prm prm ->
             Stream.junk strm__; after_colon prm
         | _ -> raise Stream.Failure)
  | p_con, "" ->
      (fun (strm__ : _ Stream.t) ->
         match Stream.peek strm__ with
           Some (con, prm) when con = p_con -> Stream.junk strm__; prm
         | _ -> raise Stream.Failure)
  | p_con, p_prm ->
      fun (strm__ : _ Stream.t) ->
        match Stream.peek strm__ with
          Some (con, prm) when con = p_con && prm = p_prm ->
            Stream.junk strm__; prm
        | _ -> raise Stream.Failure


let tparse _ = None

(* The lexer record handed to [Grammar.create]: one fresh keyword table
   shared by the scanning function and the keyword registration /
   removal callbacks. *)
let lexer =
  let kwd_table = Hashtbl.create 301 in
  {func = func kwd_table; using = using_token kwd_table;
   removing = removing_token kwd_table; tparse = tparse; text = text}



2.3.3  Grammar


(* The grammar and its entry points.  [document] is the entry parsed by
   [analyze] below; the others are the non-terminals defined in the
   EXTEND block. *)
let gram = Grammar.create lexer
let skip = Grammar.Entry.create gram "skip"
let meta = Grammar.Entry.create gram "meta"
let title = Grammar.Entry.create gram "title"
let header = Grammar.Entry.create gram "header"
let att = Grammar.Entry.create gram "att"
let att_name = Grammar.Entry.create gram "att_name"
let liste = Grammar.Entry.create gram "liste"
let ligne = Grammar.Entry.create gram "ligne"
let fin_ligne = Grammar.Entry.create gram "fin_ligne"
let document = Grammar.Entry.create gram "document"
let termes = Grammar.Entry.create gram "termes"
let terme = Grammar.Entry.create gram "terme"
let alias = Grammar.Entry.create gram "alias"

(* Grammar of the Netscape-style bookmark file format.  Camlp4 EXTEND
   syntax: quoted strings are keywords registered via [using_token];
   LIDENT / STRING / EOI are the token constructors produced by the
   lexer above. *)
EXTEND
  (* Whole file: skippable junk, META lines, the title block, then the
     top-level <DL><p> ... </DL><p> list of entries. *)
  document: [ [ LIST0 skip; m = LIST0 meta; t = title;
                "<DL>"; "<p>"; b = LIST0 ligne; "</DL>"; "<p>"; EOI
                  -> {meta=m; titre=t; bmk=b; footer=""} ] ];
  (* Tokens tolerated before the METAs (e.g. a <BASE line). *)
  skip: [ [ "<BASE" | terme ] ];
  (* A <META ...> line, reconstructed verbatim. *)
  meta: [ [ "<META"; m = termes; ">" -> "<META "^m^">\n" ] ];
  (* Title: <TITLE>...</TITLE> then an <H1> (with or without a
     LAST_MODIFIED attribute); the <H1> contents are discarded. *)
  title: [ [ "<TITLE>"; s = termes; "</TITLE>";
             "<H1>"; s' = termes; "</H1>"; OPT "<DD>" -> s |
             "<TITLE>"; s = termes; "</TITLE>";
             "<H1"; OPT "LAST_MODIFIED="; OPT STRING; ">"; s' = termes; "</H1>"; OPT "<DD>" -> s] ];
  (* A run of words/strings joined by single spaces. *)
  termes: [ [ l = LIST0 terme -> String.concat " " l ] ];
  terme: [ [ s = LIDENT -> s
           | s = STRING -> s] ];
  (* A folder: its header then its own <DL><p> ... </DL><p> entries. *)
  liste: [ [ h = header; "<DL>"; "<p>"; l = LIST0 ligne; "</DL>"; "<p>" ->
             {l_header = h; l_refs = l} ] ];
  (* Attributes that are recognized but ignored. *)
  att_name: [ [ "FOLDED" -> ()
              | "NEWITEMHEADER" -> ()
              | "PERSONAL_TOOLBAR_FOLDER=" -> ()
              | "NEW_BOOKMARK_FOLDER=" -> ()
              | "ID=" -> ()
              | "SHORTCUTURL=" -> ()
              | "LAST_VISIT=" -> ()
              | "LAST_CHARSET=" -> () ] ];
  att: [ [ att_name; OPT STRING -> ()] ];
  (* Folder header <H3 ...>title</H3> plus optional <DD> description;
     the timestamp kept is the larger of ADD_DATE / LAST_MODIFIED
     (string [max] on the raw values, then parsed as a float). *)
  header: [ [ "<H3"; LIST0 att;
              OPT "ADD_DATE="; add = OPT STRING;
              LIST0 att;
              OPT "LAST_MODIFIED="; lm = OPT STRING;
              LIST0 att;
              ">"; s = termes; "</H3>";
              OPT "<DD>"; d = termes ->
                {hd_contenu = s;
                 hd_description = d;
                 hd_add = match add,lm with
                   | Some n, Some n' -> float_of_string (max n n')
                   | Some n, None -> float_of_string n
                   | None, Some n -> float_of_string n
                   | None, None -> 0.}] ];
  (* An entry: a <DT>-prefixed link or folder, or an <HR> separator. *)
  ligne: [ [ "<DT>"; l = fin_ligne -> l
           | "<HR>" -> Separator ] ];
  (* After <DT>: a nested folder, or an <A HREF=...> link with its
     optional alias/attributes, dates, icon and description. *)
  fin_ligne: [ [ l = liste -> Liste l
           | "<A"; "HREF="; url = STRING;
             OPT alias;
             LIST0 att;
             OPT "ADD_DATE="; add = OPT STRING;
             LIST0 att;
             OPT "LAST_MODIFIED="; lm = OPT STRING;
             OPT "ICON="; icon = OPT STRING;
             LIST0 att; ">";
             s = termes; "</A>"; OPT "<DD>"; d = termes ->
               Href {href_url = url;
                     href_contenu = s;
                     href_description = d;
                     href_icon = (match icon with
                                  | None -> ""
                                  | Some s -> s);
                     href_add = match add,lm with
                       | Some n, Some n' -> float_of_string (max n n')
                       | Some n, None -> float_of_string n
                       | None, Some n -> float_of_string n
                       | None, None -> 0.}] ];
  alias: [ [ "ALIASID="; STRING -> ()
           | "ALIASOF="; STRING -> () ] ];
END

(* [analyze filename] opens and parses a bookmark file into an
   [htmldoc].  On a located parse error it prints the file position and
   re-raises the underlying exception.
   Fix: the original leaked the input channel on every error path; the
   channel is now closed whatever happens ([close_in_noerr] so the
   cleanup itself cannot mask the real error). *)
let analyze filename =
  let ch = open_in filename in
  try
    let d = Grammar.Entry.parse document (Stream.of_channel ch) in
      close_in ch;
      d
  with
      Stdpp.Exc_located (loc, e) ->
        close_in_noerr ch;
        let (_realfile, line, bc, ec) = line_of_loc filename loc in
          Printf.printf "File %s, line %d, characters %d-%d\n"
            filename line bc ec;
          raise e
    | e -> close_in_noerr ch; raise e



2.3.4  htmldoc manipulation

HTML reconstruction in labels.


(* [rehtml r] restores a small whitelist of inline HTML tags in a link's
   text and description: the bookmark file stores them entity-escaped
   (&lt;b&gt;...&lt;/b&gt;) and this rewrites them back to real tags.
   Fix: renamed the parameter from [ref], which shadowed [Stdlib.ref].
   NOTE(review): Str's ".*" is greedy, so two occurrences of the same
   tag in one field would be merged into a single span — confirm this
   is acceptable for real bookmark data. *)
let rehtml r =
  (* Apply the unescaping rewrite for every whitelisted tag in turn. *)
  let unescape st =
    List.fold_left
      (fun s kw ->
         Str.global_replace
           (Str.regexp ("&lt;"^kw^"&gt;\\(.*\\)&lt;/"^kw^"&gt;"))
           ("<"^kw^">\\1</"^kw^">") s)
      st
      ["strong"; "em"; "code"; "b"; "i"; "tt"] in
    {href_url = r.href_url;
     href_contenu = unescape r.href_contenu;
     href_description = unescape r.href_description;
     href_icon = r.href_icon;
     href_add = r.href_add}



Keyword-based filtering (folders whose titles match a keyword are removed).


(* Ordered sets of strings holding the folder titles to filter out.
   [Set.Make (String)] is equivalent to the anonymous struct with
   polymorphic [compare]: both order strings bytewise. *)
module Keywords = Set.Make (String)

(* Drop every [Filtered_out] marker from a list of entries, keeping the
   relative order of everything else. *)
let remove_filtered_out l =
  List.filter (function Filtered_out -> false | _ -> true) l

(* Filter one entry against the keyword set: a folder whose title is a
   keyword becomes [Filtered_out]; a surviving folder is filtered
   recursively; a plain link gets its inline HTML restored; separators
   pass through unchanged. *)
let rec filtre_ligne kwds entry =
  match entry with
  | Liste l when Keywords.mem l.l_header.hd_contenu kwds -> Filtered_out
  | Liste {l_header = h; l_refs = refs} ->
      Liste {l_header = h;
             l_refs =
               remove_filtered_out (List.map (filtre_ligne kwds) refs)}
  | Href h -> Href (rehtml h)
  | other -> other

(* Apply the keyword filter to the whole document's bookmark list; all
   other fields are kept as-is. *)
let filter doc kwds =
  {doc with
   bmk = remove_filtered_out (List.map (filtre_ligne kwds) doc.bmk)}



Top folder setting.


(* Fold step searching for the folder titled [top]: once the accumulator
   [found] is non-empty the search is over and it is returned untouched;
   otherwise a matching folder yields its contents, a non-matching
   folder is searched recursively, and anything else yields []. *)
let rec top_ligne top found l =
  match found, l with
  | _ :: _, _ -> found
  | [], Liste {l_header = h; l_refs = refs} ->
      if h.hd_contenu = top then refs
      else List.fold_left (top_ligne top) [] refs
  | [], _ -> []

(* Replace the document's bookmark list by the contents of the first
   folder titled [top] (empty when no such folder is found); all other
   fields are kept as-is. *)
let new_top doc top =
  {doc with bmk = List.fold_left (top_ligne top) [] doc.bmk}

   



Previous Up Next