⚠️ Warning: This is a draft ⚠️
This means it might contain formatting issues, incorrect code, conceptual problems, or other severe issues.
If you want to help to improve and eventually enable this page, please fork RosettaGit's repository and open a merge request on GitHub.
{{task|Programming environment operations}}[[Category:Networking and Web Interaction]]
Create a class for searching Yahoo! results.
It must implement a '''Next Page''' method, and read URL, Title and Content from results.
AutoHotkey
translated from python example
test:
; Demo entry point: fetch and display result pages 1 and 2 for the query "test".
Loop, 2
    yahooSearch("test", A_Index)
return
; Downloads one page of Yahoo! search results for `query` and shows the title of
; every result it can parse in a message box.
;   query - search text; it is concatenated into the URL exactly as given
;   page  - page number; page 1 maps to start offset 1, page 2 to 11, and so on
yahooSearch(query, page)
{
; Assume-global mode: start, content, reg, index, found, self1..self3 and the
; title<N> / content<N> / url<N> variables assigned below are all script globals.
global
; Convert the page number into the value sent as the "b" URL parameter.
start := ((page - 1) * 10) + 1
; Remove any earlier download, then fetch this page into search.txt and load it.
filedelete, search.txt
urldownloadtofile, % "http://search.yahoo.com/search?p=" . query
. "&b=" . start, search.txt
fileread, content, search.txt
; Pattern with three capture groups, in order: the link text, the text of the
; "abstr" div, and the text of the <span class=url> element.
; NOTE(review): tied to one specific HTML layout; it matches nothing if the markup differs.
reg = <a class="yschttl spt" href=".+?" >(.+?)</a></h3></div><div class="abstr">(.+?)</div><span class=url>(.+?)</span>
; found tracks the position of the latest match; index is assigned but never read here.
index := found := 1
; Each pass searches from one character past the previous match position and the
; loop stops as soon as regexmatch returns 0 (no further match).
while (found := regexmatch(content, reg, self, found + 1))
{
; self1..self3 hold the capture groups of the current match; A_Index numbers the
; results from 1. The cleaned title is stored in title<N> and displayed at once.
msgbox % title%A_Index% := fix(self1)
; content<N> are separate variables from `content`, which still holds the page text.
content%A_Index% := fix(self2)
url%A_Index% := fix(self3)
}
}
; Reduces a captured HTML fragment to plain text.
; Everything from the first "</a></h3></div>" onward is dropped, then every
; remaining <...> tag is removed from what is left.
fix(url)
{
    cutAt := InStr(url, "</a></h3></div>")
    if (cutAt)
        url := SubStr(url, 1, cutAt - 1)
    return RegExReplace(url, "<.*?>")
}
C#
Generally it is not a good idea to scrape web pages. For example, every implementation of this task that matches on the literal text "<a class=" has stopped working since Yahoo! changed its output format.
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Net;
using System.Text.RegularExpressions;

/// <summary>
/// One downloaded page of Yahoo! search results for a query.
/// The page is fetched in the constructor; <see cref="Results"/> parses it on demand.
/// </summary>
class YahooSearch
{
    private string query;
    private string content;
    private int page;

    const string yahoo = "http://search.yahoo.com/search?";

    // Built once instead of on every Results call. Whitespace inside the pattern is
    // ignored (IgnorePatternWhitespace); only the named groups capture (ExplicitCapture).
    private static readonly Regex resultPattern = new Regex(@"
        <li>
            <div \s class=""res"">
                <div>
                    <h3>
                        <a \s (?'LinkAttributes'[^>]+)>
                            (?'LinkText' .*?)
                        (?></a>)
                    </h3>
                </div>
                <div \s class=""abstr"">
                    (?'Abstract' .*?)
                (?></div>)
                .*?
            (?></div>)
        </li>",
        RegexOptions.IgnorePatternWhitespace
        | RegexOptions.ExplicitCapture
    );

    /// <summary>Downloads the first result page (page 0) for <paramref name="query"/>.</summary>
    public YahooSearch(string query) : this(query, 0) { }

    /// <summary>Downloads result page <paramref name="page"/> for <paramref name="query"/>.</summary>
    /// <param name="query">Raw search text; it is percent-escaped here, so pass it unencoded.</param>
    /// <param name="page">Zero-based page index; sent to the server as offset <c>page * 10 + 1</c>.</param>
    /// <exception cref="WebException">The download failed.</exception>
    public YahooSearch(string query, int page)
    {
        this.query = query;
        this.page = page;

        // Escape the query so spaces, '&', '#', etc. cannot corrupt the URL; format the
        // offset with the invariant culture so the digits never depend on the user's locale.
        string url = string.Format(
            CultureInfo.InvariantCulture,
            yahoo + "p={0}&b={1}",
            Uri.EscapeDataString(query ?? string.Empty),
            this.page * 10 + 1);

        // WebClient is IDisposable; release it as soon as the page has been read.
        using (WebClient client = new WebClient())
        {
            this.content = client.DownloadString(url);
        }
    }

    /// <summary>
    /// The results found on this page: link target, title and abstract, each with
    /// HTML tags removed. Returns an empty array when nothing matches.
    /// </summary>
    public YahooResult[] Results
    {
        get
        {
            List<YahooResult> results = new List<YahooResult>();

            foreach (Match e in resultPattern.Matches(this.content))
            {
                string rurl = StripTags(ExtractHref(e.Groups["LinkAttributes"].Value));
                string rtitle = StripTags(e.Groups["LinkText"].Value);
                string rcontent = StripTags(e.Groups["Abstract"].Value);

                results.Add(new YahooResult(rurl, rtitle, rcontent));
            }

            return results.ToArray();
        }
    }

    /// <summary>Downloads the page following this one.</summary>
    public YahooSearch NextPage()
    {
        return new YahooSearch(this.query, this.page + 1);
    }

    /// <summary>Downloads the given zero-based page for the same query.</summary>
    public YahooSearch GetPage(int page)
    {
        return new YahooSearch(this.query, page);
    }

    // Returns the text between href=" and the next double quote, or "" when either is missing.
    private static string ExtractHref(string attributes)
    {
        const string marker = "href=\"";

        int start = attributes.IndexOf(marker, StringComparison.Ordinal);
        if (start < 0)
        {
            return "";
        }

        start += marker.Length;
        int end = attributes.IndexOf('"', start);
        return end < 0 ? "" : attributes.Substring(start, end - start);
    }

    // Removes every <...> tag from an HTML fragment.
    private static string StripTags(string html)
    {
        return Regex.Replace(html, "<[^>]*>", string.Empty);
    }
}

/// <summary>A single search hit.</summary>
class YahooResult
{
    public string URL { get; set; }
    public string Title { get; set; }
    public string Content { get; set; }

    public YahooResult(string url, string title, string content)
    {
        this.URL = url;
        this.Title = title;
        this.Content = content;
    }

    /// <summary>Renders the hit as a blank line followed by Title / Link / Text lines.</summary>
    public override string ToString()
    {
        return string.Format("\nTitle: {0}\nLink: {1}\nText: {2}", Title, URL, Content);
    }
}

// Usage:

class Prog
{
    static void Main()
    {
        foreach (int page in new[] { 0, 1 })
        {
            YahooSearch x = new YahooSearch("test", page);

            foreach (YahooResult result in x.Results)
            {
                Console.WriteLine(result);
            }
        }
    }
}
D
{{trans|C#}}
import std.stdio, std.exception, std.regex, std.algorithm, std.string,
       std.net.curl;
import std.uri : encodeComponent;

/// One search hit: link target, title and abstract.
struct YahooResult {
    string url, title, content;

    /// Renders the hit as a blank line followed by Title / Link / Text lines.
    string toString() const {
        return "\nTitle: %s\nLink: %s\nText: %s"
               .format(title, url, content);
    }
}

/// One downloaded page of Yahoo! search results for a query.
struct YahooSearch {
    private string query, content;
    private uint page;

    /**
     * Downloads result page `page_` (zero-based) for `query_`.
     *
     * The query is percent-encoded before it is placed in the URL, so pass
     * the raw search text (spaces, '&' and similar characters are handled here).
     * The page index is sent to the server as offset `page * 10 + 1`.
     */
    this(in string query_, in uint page_ = 0) {
        this.query = query_;
        this.page = page_;
        this.content = "http://search.yahoo.com/search?p=%s&b=%d"
                       .format(query.encodeComponent, page * 10 + 1)
                       .get.assumeUnique;
    }

    /// Lazily yields a YahooResult for every result block matched in the page,
    /// with HTML tags stripped from the link target, title and abstract.
    @property results() const {
        // Free-spacing ("x") pattern: the blanks between the tokens are ignored.
        immutable re = `<li> <div \s class="res"> <div> <h3> <a \s (?P<linkAttributes> [^>]+)> (?P<linkText> .*?) </a> </h3> </div> <div \s class="abstr"> (?P<abstract> .*?) </div> .*? </div> </li>`;

        // Drops every <...> tag from a fragment.
        const clean = (string s) => s.replace("<[^>]*>".regex("g"),"");

        return content.match(re.regex("gx")).map!(m => YahooResult(
            // The link target is the text between href=" and the next quote.
            clean(m.captures["linkAttributes"]
                  .findSplitAfter(`href="`)[1]
                  .findSplitBefore(`"`)[0]),
            clean(m.captures["linkText"]),
            clean(m.captures["abstract"])
        ));
    }

    /// Downloads the page following this one.
    YahooSearch nextPage() const {
        return YahooSearch(query, page + 1);
    }
}

void main() {
    writefln("%(%s\n%)", "test".YahooSearch.results);
}
{{out|Output (shortened)}}
Title: Test.com
Link: http://www.test.com/
Text: Test.com provides a complete software solution for creating online tests and managing enterprise and specialist certification programs, in up to 22 languages.
Title: Speakeasy Speed Test
Link: http://www.speakeasy.net/speedtest/
Text: Test your Internet Connection with Speakeasy's reliable and accurate broadband speed test. What's your speed?
Title: Test | Define Test at Dictionary.com
Link: http://dictionary.reference.com/browse/test
Text: noun 1. the means by which the presence, quality, or genuineness of anything is determined; a means of trial. 2. the trial of the quality of something: to put to the ...
Gambas
' Runs when the form opens: maximises the window and fills it with a single
' WebView that is pointed at the Yahoo! home page.
Public Sub Form_Open()

  Dim hBrowser As WebView

  ' Window set-up.
  With Me
    .Arrangement = Arrange.Fill
    .Maximized = True
    .Title = "Yahoo! search interface"
  End With

  ' Embedded browser, created as a child of the form.
  hBrowser = New WebView(Me)
  With hBrowser
    .Expand = True
    .URL = "https://www.yahoo.com"
  End With

End
'''[http://www.cogier.com/gambas/Yahoo!%20search%20interface.png Click here to see output (I have typed 'rosettacode' in the search box)]'''
GUISS
Start,Programs,Applications,Mozilla Firefox,Inputbox:address bar>www.yahoo.co.uk,Button:Go,Area:browser window,Inputbox:searchbox>elephants,Button:Search
Haskell
Haskell is not an object-oriented language, so this example does not implement an object class. However, it is an interesting example of how HTML source code can be parsed using the Parsec library.
import Network.HTTP
import Text.Parsec

-- | A single search hit.
data YahooSearchItem = YahooSearchItem {
    itemUrl, itemTitle, itemContent :: String }

-- | A page of search results together with the query and the
-- (1-based) page number that produced it.
data YahooSearch = YahooSearch {
    searchQuery :: String,
    searchPage :: Int,
    searchItems :: [YahooSearchItem] }

-- | URL for Yahoo! searches, without giving a page number.
yahooUrl :: String
yahooUrl = "http://search.yahoo.com/search?p="

-- | Make an HTTP request and return the first page of results as a
-- 'YahooSearch'. The query is placed in the URL as given, so it must
-- already be URL-escaped.
yahoo :: String -> IO YahooSearch
yahoo q = simpleHTTP (getRequest $ yahooUrl ++ q) >>=
    getResponseBody >>= return . YahooSearch q 1 . items

-- | Fetch the page of results that follows the given one.
--
-- The @b@ parameter is the index of the first result to show, not a
-- page number: with ten results per page, the page after page @p@
-- starts at result @p * 10 + 1@ (page 2 starts at 11, page 3 at 21).
next :: YahooSearch -> IO YahooSearch
next (YahooSearch q p _) =
    simpleHTTP (getRequest $ yahooUrl ++ q ++ "&b=" ++ show firstResult) >>=
    getResponseBody >>= return . YahooSearch q (p + 1) . items
  where
    firstResult = p * 10 + 1

-- | Print the query, the page number and every item of a result page.
printResults :: YahooSearch -> IO ()
printResults (YahooSearch q p items) = do
    putStrLn $ "Showing Yahoo! search results for query: " ++ q
    putStrLn $ "Page: " ++ show p
    putChar '\n'
    mapM_ printOne items
  where
    printOne (YahooSearchItem itemUrl itemTitle itemContent) = do
        putStrLn $ "URL : " ++ itemUrl
        putStrLn $ "Title : " ++ itemTitle
        putStrLn $ "Abstr : " ++ itemContent
        putChar '\n'

urlTag, titleTag, contentTag1, contentTag2, ignoreTag, ignoreText ::
    Parsec String () String

-- parse a tag containing the URL of a search result
urlTag = do { string "<a id=\"link-"; many digit;
              string "\" class=\"yschttl spt\" href=\"";
              url <- manyTill anyChar (char '"');
              manyTill anyChar (char '>');
              return url }

-- the title comes after the URL tag, so parse the URL tag first,
-- discard it and get the title text
titleTag = do { urlTag; manyTill anyChar (try (string "</a>")) }

-- parse a tag containing the description of the search result;
-- the tag can be named "sm-abs" or "abstr"
contentTag1 = do { string "<div class=\"sm-abs\">";
                   manyTill anyChar (try (string "</div>")) }
contentTag2 = do { string "<div class=\"abstr\">";
                   manyTill anyChar (try (string "</div>")) }

-- parse a tag and discard it
ignoreTag = do { char ('<'); manyTill anyChar (char '>');
                 return "" }

-- parse some text and discard it
ignoreText = do { many1 (noneOf "<"); return "" }

-- return only non-empty strings
nonempty :: [String] -> Parsec String () [String]
nonempty xs = return [ x | x <- xs, not (null x) ]

-- a template to parse a whole source file looking for items of the
-- same class; the discarded pieces come back as "" and are filtered out
parseCategory x = do
    res <- many x
    eof
    nonempty res

urls, titles, contents :: Parsec String () [String]

-- parse HTML source looking for URL tags of the search results
urls = parseCategory url where
    url = (try urlTag) <|> ignoreTag <|> ignoreText

-- parse HTML source looking for titles of the search results
titles = parseCategory title where
    title = (try titleTag) <|> ignoreTag <|> ignoreText

-- parse HTML source looking for descriptions of the search results
contents = parseCategory content where
    content = (try contentTag1) <|> (try contentTag2) <|>
              ignoreTag <|> ignoreText

-- | Parse the HTML source three times looking for URL, title and
-- description of all search results and return them zipped together
-- as a list of 'YahooSearchItem'. A parse failure yields no items for
-- that category.
items :: String -> [YahooSearchItem]
items q =
    let ignoreOrKeep = either (const []) id
        us = ignoreOrKeep $ parse urls "" q
        ts = ignoreOrKeep $ parse titles "" q
        cs = ignoreOrKeep $ parse contents "" q
    in [ YahooSearchItem { itemUrl = u, itemTitle = t, itemContent = c } |
         (u, t, c) <- zip3 us ts cs ]
Simple invocation from GHCi:
yahoo "Rosetta%20code" >>= printResults
Notice that spaces must be expressed as "%20", because spaces are not allowed in URLs.
==Icon and {{header|Unicon}}==
The following uses the Unicon pre-processor and messaging extensions and won't run under Icon without significant modification. The code provides a suitable demonstration; however, it could be made more robust by, for example, URL-escaping the search string.
link printf,strings
# Demo driver: fetch and print the first two result pages for "rosettacode".
procedure main()
   local searcher

   searcher := YahooSearch("rosettacode")
   every 1 to 2 do {            # 2 pages
      searcher.readnext()
      searcher.showinfo()
   }
end
# Fields:
#   urlpat   - URL pattern for the query, with a %d slot for the start offset
#   page     - number of the page most recently requested (0 before the first read)
#   response - text of the most recently downloaded page
class YahooSearch(urlpat,page,response) #: class for Yahoo Search
# Advance to the next page and download it.
method readnext() #: read the next page of search results
self.page +:= 1 # can't find as w|w/o self
readurl()
end
# Download the current page into self.response.
method readurl() #: read the url
# Page 1 maps to start offset 1, page 2 to 11, and so on.
url := sprintf(self.urlpat,(self.page-1)*10+1)
# Open the URL in "m" mode; the program stops with a message if that fails.
m := open(url,"m") | stop("Unable to open : ",url)
# Reset response to the empty string, then append everything read() yields from m.
every (self.response := "") ||:= |read(m)
close(m)
self.response := deletec(self.response,"\x00") # kill stray NULs
end
# Scan self.response and print title, URL and abstract of each result found.
method showinfo() #: show the info of interest
self.response ? repeat {
# Move to the next "<" that begins a result anchor; leave the loop when there is none.
(tab(find("<")) & ="<a class=\"yschttl spt\" href=\"") | break
# url is the text up to the closing quote; then skip past the end of the anchor tag.
url := tab(find("\"")) & tab(find(">")+1)
# title is the text up to a "<" that begins "</a></h3></div>"; other "<" positions
# are retried, so inline tags such as <b> stay inside the title.
title := tab(find("<")) & ="</a></h3></div>"
# Skip ahead to the opening tag of the abstract, which has one of two class names.
tab(find("<")) & =("<div class=\"abstr\">" | "<div class=\"sm-abs\">")
# abstr is the text up to a "<" that begins "</div>".
abstr := tab(find("<")) & ="</div>"
printf("\nTitle : %i\n",title)
printf("URL : %i\n",url)
printf("Abstr : %i\n",abstr)
}
end
# Build the URL pattern for searchtext (inserted as given; the %%d becomes the %d
# slot that readurl fills in) and start before the first page.
initially(searchtext) #: initialize each instance
urlpat := sprintf("http://search.yahoo.com/search?p=%s&b=%%d",searchtext)
page := 0
end
{{libheader|Icon Programming Library}} [http://www.cs.arizona.edu/icon/library/src/procs/printf.icn printf.icn provides formatting] [http://www.cs.arizona.edu/icon/library/src/procs/strings.icn strings.icn provides deletec]
Sample Output (truncated):
Title : "<b>Rosetta Code</b> - <b>Rosetta Code</b>"
URL : "http://rosettacode.org/"
Abstr : "<b>Rosetta Code</b> is a programming chrestomathy site. The idea is to
present solutions to the same task in as many different languages as possible, t
o demonstrate how ..."
Title : "<b>Rosetta Code</b> - Wikipedia, the free <wbr />encyclopedia"
URL : "http://en.wikipedia.org/wiki/Rosetta_Code"
Abstr : " <b>Rosetta Code</b> is a wiki -based programming chrestomathy website
with solutions to various programming problems in many different programming lan
guages. It was created ..."
Title : "Category:AutoHotkey - <b>Rosetta Code</b>"
URL : "http://rosettacode.org/wiki/Category:AutoHotkey"
Abstr : "Listed below are all of the tasks on <b>Rosetta Code</b> which have bee
n solved using AutoHotkey."
...
Title : "RosettaCON2011 Tutorials Collection | <wbr />RosettaCommons"
URL : "http://www.rosettacommons.org/"
Abstr : "Foldit in the news. Cooper et al. 2010 Predicting protein structures wi
th a multiplayer online game, Nature 466 , 756 see also video. Rosetta-3.3 is no
w available!"
Title : "CALL: call a SUBROUTINE - HicEst: <wbr />Windows IDE programming ..."
URL : "http://www.hicest.com/CALL.htm"
Abstr : "\xe2\x87\x92 Example of a CALL call in "Roman_numerals" (<b>R
osettaCode</b>) CALL transfers control to the first statement of a SUBROUTINE. C
ALL subroutine_name[argument1, ..."
Java
{{incorrect|Java}}
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URISyntaxException; import java.net.URL; import java.net.URLDecoder; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; class YahooSearch { private String query; // Page number private int page = 1; // Regexp to look for the individual results in the returned page private static final Pattern pattern = Pattern.compile( "<a class=\"yschttl spt\" href=\"[^*]+?\\*\\*([^\"]+?)\">(.+?)</a></h3>.*?<div class=\"(?:sm-abs|abstr)\">(.+?)</div>"); public YahooSearch(String query) { this.query = query; } public List<YahooResult> search() throws MalformedURLException, URISyntaxException, IOException { // Build the search string, starting with the Yahoo search URL, // then appending the query and optionally the page number (if > 1) StringBuilder searchUrl = new StringBuilder("http://search.yahoo.com/search?"); searchUrl.append("p=").append(URLEncoder.encode(query, "UTF-8")); if (page > 1) {searchUrl.append("&b=").append((page - 1) * 10 + 1);} // Query the Yahoo search engine URL url = new URL(searchUrl.toString()); List<YahooResult> result = new ArrayList<YahooResult>(); StringBuilder sb = new StringBuilder(); // Get the search results using a buffered reader BufferedReader in = null; try { in = new BufferedReader(new InputStreamReader(url.openStream())); // Read the results line by line String line = in.readLine(); while (line != null) { sb.append(line); line = in.readLine(); } } catch (IOException ioe) { ioe.printStackTrace(); } finally { try {in.close();} catch (Exception ignoreMe) {} } String searchResult = sb.toString(); // Look for the individual results by matching the regexp pattern Matcher matcher = pattern.matcher(searchResult); while (matcher.find()) { // Extract the result URL, title and excerpt String resultUrl = 
URLDecoder.decode(matcher.group(1), "UTF-8"); String resultTitle = matcher.group(2).replaceAll("</?b>", "").replaceAll("<wbr ?/?>", ""); String resultContent = matcher.group(3).replaceAll("</?b>", "").replaceAll("<wbr ?/?>", ""); // Create a new YahooResult and add to the list result.add(new YahooResult(resultUrl, resultTitle, resultContent)); } return result; } public List<YahooResult> search(int page) throws MalformedURLException, URISyntaxException, IOException { // Set the page number and search this.page = page; return search(); } public List<YahooResult> nextPage() throws MalformedURLException, URISyntaxException, IOException { // Increment the page number and search page++; return search(); } public List<YahooResult> previousPage() throws MalformedURLException, URISyntaxException, IOException { // Decrement the page number and search; if the page number is 1 return an empty list if (page > 1) { page--; return search(); } else return new ArrayList<YahooResult>(); } } class YahooResult { private URL url; private String title; private String content; public URL getUrl() { return url; } public void setUrl(URL url) { this.url = url; } public void setUrl(String url) throws MalformedURLException { this.url = new URL(url); } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } public YahooResult(URL url, String title, String content) { setUrl(url); setTitle(title); setContent(content); } public YahooResult(String url, String title, String content) throws MalformedURLException { setUrl(url); setTitle(title); setContent(content); } @Override public String toString() { StringBuilder sb = new StringBuilder(); if (title != null) { sb.append(",title=").append(title); } if (url != null) { sb.append(",url=").append(url); } return sb.charAt(0) == ',' ? 
sb.substring(1) : sb.toString(); } } public class TestYahooSearch { public static void main(String[] args) throws MalformedURLException, URISyntaxException, IOException { // Create a new search YahooSearch search = new YahooSearch("Rosetta code"); // Get the search results List<YahooResult> results = search.search(); // Show the search results for (YahooResult result : results) { System.out.println(result.toString()); } } }
Kotlin
{{incorrect|Kotlin}} This is based on the C# entry but uses a regular expression based on what appears to be the Yahoo! format as at the date of this entry (4 December 2017).
// version 1.2.0

import java.net.URL
import java.net.URLEncoder

// Groups: 1 = link target, 2 = title markup, 3 = abstract markup.
val rx = Regex("""<div class=\"yst result\">.+?<a href=\"(.*?)\" class=\"\">(.*?)</a>.+?class="abstract ellipsis">(.*?)</p>""")

/** A single search hit. */
class YahooResult(var title: String, var link: String, var text: String) {

    override fun toString() = "\nTitle: $title\nLink : $link\nText : $text"
}

/**
 * One downloaded page of Yahoo! search results.
 *
 * @param query raw search text; it is URL-encoded here, so pass it unencoded
 * @param page zero-based page index, sent to the server as offset `page * 10 + 1`
 */
class YahooSearch(val query: String, val page: Int = 0) {

    private val content: String

    init {
        val yahoo = "http://search.yahoo.com/search?"
        // Encode the query so spaces, '&', '#', etc. cannot corrupt the URL.
        val encodedQuery = URLEncoder.encode(query, "UTF-8")
        val url = URL("${yahoo}p=$encodedQuery&b=${page * 10 + 1}")
        content = url.readText()
    }

    /** The results matched on this page, with the bold highlighting tags removed. */
    val results: MutableList<YahooResult>
        get() {
            val list = mutableListOf<YahooResult>()
            for (mr in rx.findAll(content)) {
                val (link, title, text) = mr.destructured
                list.add(YahooResult(stripBold(title), link, stripBold(text)))
            }
            return list
        }

    /** Downloads the page following this one. */
    fun nextPage() = YahooSearch(query, page + 1)

    /** Downloads the given zero-based page for the same query. */
    fun getPage(newPage: Int) = YahooSearch(query, newPage)
}

// Removes the <b>...</b> highlighting tags from a fragment.
private fun stripBold(html: String) = html.replace("<b>", "").replace("</b>", "")

fun main(args: Array<String>) {
    for (page in 0..1) {
        val x = YahooSearch("rosettacode", page)
        println("\nPAGE ${page + 1} =>")
        for (result in x.results.take(3)) println(result)
    }
}
Output (restricted to first three results on first two pages):
PAGE 1 =>
Title: Rosetta Code - Official Site
Link : http://rosettacode.org/wiki/Rosetta_Code
Text : Rosetta Code is a programming chrestomathy site. The idea is to present solutions to the same task in as ...
Title: Rosetta Code - Wikipedia
Link : https://en.wikipedia.org/wiki/Rosetta_Code
Text : Rosetta Code is a wiki-based programming chrestomathy website with implementations of common algorithms ...
Title: Rosetta Code (@rosettacode) | Twitter
Link : https://twitter.com/rosettacode
Text : The latest Tweets from Rosetta Code (@rosettacode). Twitter account for http://t.co/DuRZFWDfRn. The ...
PAGE 2 =>
Title: Rosetta Code Blog
Link : http://blog.rosettacode.org/
Text : As I noted, there was an expectation of downtime as the VPS hostRosetta Code sits on moved from one data ...
Title: Rosetta Code - Wikipedia
Link : https://en.wikipedia.org/wiki/User:Paddy3118/Rosetta_Code
Text : Rosetta Code is a wiki-based programming chrestomathy website with implementations of common algorithms ...
Title: Rosetta Code and ABAP | SAP Blogs
Link : https://blogs.sap.com/2015/03/27/rosetta-code-and-abap/
Text : Last week Christian Drumm (@ceedee666) and Fred Verheul (@fredverheul) had a short conversation on ...
Mathematica
We cannot define a class in Mathematica, so I generate a "Manipulate" object instead.