⚠️ Warning: This is a draft ⚠️
This means it might contain formatting issues, incorrect code, conceptual problems, or other severe issues.
If you want to help to improve and eventually enable this page, please fork RosettaGit's repository and open a merge request on GitHub.
The content of this page is related to the main page [[Web Scraping#OCaml]]
(** [init_socket addr port] resolves the host name [addr], opens a TCP
    connection to its first address on [port] and returns
    [(inchan, outchan)].  Both channels wrap the same socket descriptor.

    Exceptions raised by [Unix.gethostbyname], [Unix.socket] and
    [Unix.connect] are propagated to the caller. *)
let init_socket addr port =
  let inet_addr = (Unix.gethostbyname addr).Unix.h_addr_list.(0) in
  let sockaddr = Unix.ADDR_INET (inet_addr, port) in
  let sock = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 0 in
  (* Release the descriptor if the connection cannot be established,
     then let the original exception through. *)
  (try Unix.connect sock sockaddr
   with e ->
     (try Unix.close sock with Unix.Unix_error _ -> ());
     raise e);
  let outchan = Unix.out_channel_of_descr sock in
  let inchan = Unix.in_channel_of_descr sock in
  (inchan, outchan)
;;
(** [serialize ~post_data] turns a list of [(key, value)] pairs into an
    [application/x-www-form-urlencoded] string: [key=value] items joined
    with ["&"].

    Keys and values are percent-encoded: ASCII letters, digits and
    [*-._] are kept, a space becomes ["+"], every other byte becomes
    [%XX] (upper-case hexadecimal).  Pass raw, not pre-encoded, text. *)
let serialize ~post_data =
  let url_encode s =
    let buf = Buffer.create (String.length s) in
    String.iter
      (function
        | ('A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '*' | '-' | '.' | '_') as c ->
          Buffer.add_char buf c
        | ' ' -> Buffer.add_char buf '+'
        | c -> Buffer.add_string buf (Printf.sprintf "%%%02X" (Char.code c)))
      s;
    Buffer.contents buf
  in
  String.concat "&"
    (List.map
       (fun (key, value) -> url_encode key ^ "=" ^ url_encode value)
       post_data)
;;
(** Kind of HTTP request to send.  [POST] carries the form fields as
    [(key, value)] pairs. *)
type request = GET | HEAD | POST of (string * string) list
(** [submit_request ~address ~port ~kind ~path ~referer ~user_agent]
    connects to [address] on [port] with [init_socket], writes an HTTP/1.0
    request for [path] and returns the connection's [(inchan, outchan)]
    once the request has been flushed.

    [referer] and [user_agent] are [string option] values; [None] leaves
    the corresponding header out.  For [POST data] the pairs are encoded
    with [serialize] and sent as an [application/x-www-form-urlencoded]
    body whose length is announced in [Content-length].

    If writing the request fails the connection is closed and the
    exception re-raised.  Otherwise closing is left to the caller. *)
let submit_request ~address ~port ~kind ~path ~referer ~user_agent =
  let req_tag, body =
    match kind with
    | GET -> "GET", None
    | HEAD -> "HEAD", None
    | POST data -> "POST", Some (serialize ~post_data:data)
  in
  let optional_header name = function
    | None -> ""
    | Some value -> Printf.sprintf "%s: %s\r\n" name value
  in
  (* The Host header names the port only when it is not the HTTP default. *)
  let host =
    if port = 80 then address else Printf.sprintf "%s:%d" address port
  in
  let body_headers, body_text =
    match body with
    | None -> "", ""
    | Some data ->
      ("Content-type: application/x-www-form-urlencoded\r\n"
       ^ Printf.sprintf "Content-length: %d\r\n" (String.length data)
       ^ "Connection: close\r\n",
       data)
  in
  (* Headers, one blank line, then exactly Content-length bytes of body:
     nothing is appended after the body. *)
  let request =
    String.concat "" [
      Printf.sprintf "%s %s HTTP/1.0\r\n" req_tag path;
      Printf.sprintf "Host: %s\r\n" host;
      optional_header "User-Agent" user_agent;
      optional_header "Referer" referer;
      body_headers;
      "\r\n";
      body_text;
    ]
  in
  let (inchan, outchan) = init_socket address port in
  (try
     output_string outchan request;
     flush outchan
   with e ->
     (* Do not leak the socket when the request cannot be written. *)
     close_out_noerr outchan;
     raise e);
  (inchan, outchan)
;;
(** [strip_cr str] returns a copy of [str] with every carriage-return
    character (['\r']) removed; all other characters keep their order. *)
let strip_cr str =
  (* Strings are immutable, so the result is accumulated in a Buffer. *)
  let buf = Buffer.create (String.length str) in
  String.iter (fun c -> if c <> '\r' then Buffer.add_char buf c) str;
  Buffer.contents buf
;;
(** [cont_of_inchan ?limit ic] reads an HTTP response from [ic] and returns
    [(first_line, header, page)]:
    - [first_line] is the first line read, with carriage returns removed;
    - [header] is the list of following lines up to the first empty line
      (or end of input), carriage returns removed, most recently read
      line first;
    - [page] is everything read after the header.

    When [limit] is given, at most [limit] bytes of the page are read; a
    limit of zero or less reads nothing.

    @raise End_of_file if [ic] is already at end of input when the first
    line is read. *)
let cont_of_inchan ?limit ic =
  let first_line = strip_cr (input_line ic) in
  (* Matching on the exception keeps the recursive call in tail position. *)
  let rec get_header acc =
    match input_line ic with
    | "" | "\r" -> acc
    | line -> get_header (strip_cr line :: acc)
    | exception End_of_file -> acc
  in
  let header = get_header [] in
  let chunk_size = 1024 in
  let buf = Buffer.create 10240 in
  let chunk = Bytes.create chunk_size in
  (* [remaining] is [None] for an unbounded read, [Some n] when at most
     [n] more bytes may be taken. *)
  let rec read_body remaining =
    let want =
      match remaining with
      | None -> chunk_size
      | Some n -> min n chunk_size
    in
    if want > 0 then begin
      (* [input] returns 0 at end of input. *)
      let got = input ic chunk 0 want in
      if got > 0 then begin
        Buffer.add_subbytes buf chunk 0 got;
        read_body
          (match remaining with
           | None -> None
           | Some n -> Some (n - got))
      end
    end
  in
  read_body limit;
  (first_line, header, Buffer.contents buf)
;;
(** [cut_url ~url] splits [url] into [(address, path)].

    A leading ["http://"] (compared case-insensitively) is dropped.  The
    text before the first ['/'] of what remains is the address, and the
    rest, including that ['/'], is the path.  When there is no ['/'] the
    path is ["/"]. *)
let cut_url ~url =
  let scheme = "http://" in
  let scheme_len = String.length scheme in
  let url_len = String.length url in
  let rest =
    if url_len >= scheme_len
    && String.lowercase_ascii (String.sub url 0 scheme_len) = scheme
    then String.sub url scheme_len (url_len - scheme_len)
    else url
  in
  match String.index rest '/' with
  | pos ->
    (String.sub rest 0 pos, String.sub rest pos (String.length rest - pos))
  | exception Not_found -> (rest, "/")
;;
(** [make_request ~url ?port ?kind ?referer ?user_agent ()] splits [url]
    with [cut_url], sends the request with [submit_request], reads the
    whole response with [cont_of_inchan] and closes the connection.

    [port] defaults to [80] and [kind] to [GET].  The result is the
    [(first_line, header, page)] triple produced by [cont_of_inchan].

    The connection is closed even when reading the response raises; the
    exception is then re-raised. *)
let make_request ~url ?(port=80) ?(kind=GET) ?referer ?user_agent () =
  let (address, path) = cut_url ~url in
  let (inchan, _outchan) =
    submit_request ~address ~port ~kind ~path ~referer ~user_agent
  in
  (* Both channels share one descriptor, so closing [inchan] releases it. *)
  match cont_of_inchan inchan with
  | cont -> close_in inchan; cont
  | exception e -> close_in_noerr inchan; raise e
;;