⚠️ Warning: This is a draft ⚠️

This means it might contain formatting issues, incorrect code, conceptual problems, or other severe issues.

If you want to help to improve and eventually enable this page, please fork RosettaGit's repository and open a merge request on GitHub.

The content of this page is related to the main page [[Web Scraping#OCaml]]


let init_socket addr port =
  let inet_addr = (Unix.gethostbyname addr).Unix.h_addr_list.(0) in
  let sockaddr = Unix.ADDR_INET (inet_addr, port) in
  let suck = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 0 in
  Unix.connect suck sockaddr;
  let outchan = Unix.out_channel_of_descr suck in
  let inchan = Unix.in_channel_of_descr suck in
  (inchan, outchan)
;;

let serialize ~post_data =
  String.concat "&"
    (List.map (fun (key, var) -> key ^ "=" ^ var) post_data)
;;

type request = GET | HEAD | POST of (string * string) list

let submit_request ~address ~port ~kind ~path ~referer ~user_agent =
  let req_tag, post_data =
    match kind with
    | GET -> "GET", None
    | HEAD -> "HEAD", None
    | POST data -> "POST", Some data
  in
  let request =
    (Printf.sprintf "%s %s HTTP/1.0\r\n" req_tag path) ^
    (Printf.sprintf "Host: %s\r\n" address) ^
    (match user_agent with None -> "" | Some ua -> Printf.sprintf "User-Agent: %s\r\n" ua) ^
    (match referer with None -> "" | Some referer -> Printf.sprintf "Referer: %s\r\n" referer) ^
    (match post_data with None -> ""
     | Some post_data -> let post_data = serialize ~post_data in
         "Content-type: application/x-www-form-urlencoded\r\n" ^
         "Content-length: "^ string_of_int(String.length post_data) ^"\r\n" ^
         "Connection: close\r\n" ^
         "\r\n" ^
         post_data
    ) ^
    ("\r\n")
  in
  let (inchan, outchan) = init_socket address port in
  output_string outchan request;
  flush outchan;
  (inchan, outchan)
;;

let strip_cr str =
  let len = String.length str in
  let striped = String.create len in
  let rec aux i j =
    if i >= len then j else begin
      if str.[i] <> '\r' then begin
        striped.[j] <- str.[i];
        aux (succ i) (succ j)
      end else begin
        aux (succ i) j
      end
    end
  in
  let nlen = aux 0 0 in
  (String.sub striped 0 nlen)
;;

let cont_of_inchan ?limit ic =
  let first_line = strip_cr(input_line ic) in
  let rec get_header acc =
    try
      let line = input_line ic in
      if line = "\r" || line = ""
      then acc
      else get_header(strip_cr line::acc)
    with End_of_file -> acc
  in
  let header = get_header []
  in
  let buf = Buffer.create 10240 in
  let tmp = String.make 1024 '\000' in
  let rec aux lim =
    let bytes = input ic tmp 0 (min lim 1024) in
    if bytes > 0 then begin
      Buffer.add_substring buf tmp 0 bytes;
      aux (lim - bytes)
    end
  in
  let rec aux_nolim() =
    let bytes = input ic tmp 0 1024 in
    if bytes > 0 then begin
      Buffer.add_substring buf tmp 0 bytes;
      aux_nolim()
    end
  in
  (try
     match limit with
     | Some lim -> aux lim
     | None -> aux_nolim()
   with End_of_file -> ());
  let page = Buffer.contents buf in
  (first_line, header, page)
;;

let cut_url ~url =
  let len = String.length url in
  let (address, len) =
    if len < 7 then (url, len) else
    begin
      let first_7 = String.sub url 0 7 in
      if first_7 = "http://"
      then (String.sub url 7 (len - 7), (len - 7))
      else (url, len)
    end
  in
  let (address, path) =
    try
      let pos = String.index address '/' in
      (String.sub address 0 pos,
       String.sub address (pos) (len - pos))
    with _ ->
      (address, "/")
  in
  (address, path)
;;

let make_request ~url ?(port=80) ?(kind=GET) ?referer ?user_agent () =
  let (address, path) = cut_url ~url in
  let (inchan, outchan) = submit_request ~address ~port ~kind ~path ~referer ~user_agent in
  let cont = cont_of_inchan inchan in
  close_in inchan;
  (cont)
;;