⚠️ Warning: This is a draft ⚠️
This means it might contain formatting issues, incorrect code, conceptual problems, or other severe issues.
If you want to help to improve and eventually enable this page, please fork RosettaGit's repository and open a merge request on GitHub.
The content of this page is related to the main page [[Web Scraping#OCaml]]
(** [init_socket addr port] resolves the host name [addr], opens a TCP
    connection to it on [port] and returns the pair
    [(in_channel, out_channel)] wrapping the connected socket.  Both
    channels share the same file descriptor.

    @raise Not_found if the host cannot be resolved or resolves to no
    address.
    @raise Unix.Unix_error if the socket cannot be created or connected. *)
let init_socket addr port =
  let host = Unix.gethostbyname addr in
  if Array.length host.Unix.h_addr_list = 0 then raise Not_found;
  let sockaddr = Unix.ADDR_INET (host.Unix.h_addr_list.(0), port) in
  (* Pick the socket domain from the resolved address instead of
     assuming IPv4. *)
  let sock =
    Unix.socket (Unix.domain_of_sockaddr sockaddr) Unix.SOCK_STREAM 0
  in
  (* Do not leak the descriptor when the connection attempt fails. *)
  (try Unix.connect sock sockaddr
   with e -> Unix.close sock; raise e);
  let inchan = Unix.in_channel_of_descr sock in
  let outchan = Unix.out_channel_of_descr sock in
  (inchan, outchan)
;;

(** [serialize ~post_data] joins the [(key, value)] pairs as
    [key=value] items separated by ['&'].

    No percent-encoding is applied: keys and values are emitted
    verbatim, so callers must encode them beforehand if needed. *)
let serialize ~post_data =
  String.concat "&"
    (List.map (fun (key, var) -> key ^ "=" ^ var) post_data)
;;

(** HTTP request methods supported by this client.  [POST] carries the
    form fields to send as the request body. *)
type request =
  | GET
  | HEAD
  | POST of (string * string) list

(** [build_request ~address ~kind ~path ~referer ~user_agent] returns
    the full text of the HTTP/1.0 request (request line, headers, blank
    line and, for [POST], the form-encoded body). *)
let build_request ~address ~kind ~path ~referer ~user_agent =
  let req_tag, post_data =
    match kind with
    | GET -> ("GET", None)
    | HEAD -> ("HEAD", None)
    | POST data -> ("POST", Some (serialize ~post_data:data))
  in
  let optional_header name = function
    | None -> ""
    | Some value -> Printf.sprintf "%s: %s\r\n" name value
  in
  let entity_headers, body =
    match post_data with
    | None -> ("", "")
    | Some body ->
      ( "Content-type: application/x-www-form-urlencoded\r\n"
        ^ "Content-length: " ^ string_of_int (String.length body) ^ "\r\n"
        ^ "Connection: close\r\n",
        body )
  in
  (* The blank line ends the header section; the body follows with no
     trailing bytes so that it matches Content-length exactly. *)
  String.concat ""
    [ Printf.sprintf "%s %s HTTP/1.0\r\n" req_tag path;
      Printf.sprintf "Host: %s\r\n" address;
      optional_header "User-Agent" user_agent;
      optional_header "Referer" referer;
      entity_headers;
      "\r\n";
      body ]
;;

(** [submit_request ~address ~port ~kind ~path ~referer ~user_agent]
    connects to [address] on [port], sends the HTTP/1.0 request and
    returns the [(in_channel, out_channel)] pair of the connection so
    the caller can read the response.  [referer] and [user_agent] are
    [string option] values; the matching header is omitted on [None].

    If sending fails the connection is closed before the exception is
    re-raised. *)
let submit_request ~address ~port ~kind ~path ~referer ~user_agent =
  let request = build_request ~address ~kind ~path ~referer ~user_agent in
  let (inchan, outchan) = init_socket address port in
  (try
     output_string outchan request;
     flush outchan
   with e -> close_in_noerr inchan; raise e);
  (inchan, outchan)
;;

(** [strip_cr str] returns a copy of [str] with every carriage return
    (['\r']) removed. *)
let strip_cr str =
  let buf = Buffer.create (String.length str) in
  String.iter (fun c -> if c <> '\r' then Buffer.add_char buf c) str;
  Buffer.contents buf
;;

(** [cont_of_inchan ?limit ic] reads an HTTP response from [ic] and
    returns [(first_line, header, page)]:
    - [first_line] is the first line read, without carriage returns;
    - [header] is the list of header lines without carriage returns,
      in reverse order of receipt (the last header received comes
      first);
    - [page] is the body, truncated to at most [limit] bytes when
      [limit] is given (a negative [limit] is treated as [0]).

    @raise End_of_file if the channel is empty. *)
let cont_of_inchan ?limit ic =
  let first_line = strip_cr (input_line ic) in
  (* The header section ends at the first empty line or at end of
     input.  The [try] only wraps the read, which keeps the loop
     tail-recursive. *)
  let rec get_header acc =
    match (try Some (input_line ic) with End_of_file -> None) with
    | None | Some "" | Some "\r" -> acc
    | Some line -> get_header (strip_cr line :: acc)
  in
  let header = get_header [] in
  let chunk_size = 1024 in
  let chunk = Bytes.create chunk_size in
  let buf = Buffer.create 10240 in
  let rec read_body remaining =
    let want =
      match remaining with
      | None -> chunk_size
      | Some n -> min (max n 0) chunk_size
    in
    if want > 0 then begin
      let got = input ic chunk 0 want in
      (* [input] returns 0 at end of input. *)
      if got > 0 then begin
        Buffer.add_subbytes buf chunk 0 got;
        read_body
          (match remaining with
           | None -> None
           | Some n -> Some (n - got))
      end
    end
  in
  (try read_body limit with End_of_file -> ());
  (first_line, header, Buffer.contents buf)
;;

(** [cut_url ~url] splits [url] into [(address, path)].  A leading
    ["http://"] (matched case-insensitively) is dropped; [address] is
    everything up to the first ['/'] and [path] is the rest, including
    that ['/'].  When there is no ['/'] the path is ["/"]. *)
let cut_url ~url =
  let scheme = "http://" in
  let scheme_len = String.length scheme in
  let url_len = String.length url in
  let rest =
    if url_len >= scheme_len
    && String.lowercase_ascii (String.sub url 0 scheme_len) = scheme
    then String.sub url scheme_len (url_len - scheme_len)
    else url
  in
  try
    let pos = String.index rest '/' in
    (String.sub rest 0 pos, String.sub rest pos (String.length rest - pos))
  with Not_found -> (rest, "/")
;;

(** [make_request ~url ?port ?kind ?referer ?user_agent ()] performs an
    HTTP/1.0 request on [url] and returns the
    [(first_line, header, page)] triple produced by {!cont_of_inchan}.
    [port] defaults to [80] and [kind] to [GET].

    The connection is closed before returning, including when reading
    the response raises. *)
let make_request ~url ?(port=80) ?(kind=GET) ?referer ?user_agent () =
  let (address, path) = cut_url ~url in
  let (inchan, _outchan) =
    submit_request ~address ~port ~kind ~path ~referer ~user_agent
  in
  let cont =
    try cont_of_inchan inchan
    with e -> close_in_noerr inchan; raise e
  in
  (* Both channels share one descriptor, so closing the input side
     releases the socket. *)
  close_in inchan;
  cont
;;