Yahoo! search interface

⚠️ Warning: This is a draft ⚠️

This means it might contain formatting issues, incorrect code, conceptual problems, or other severe issues.

If you want to help to improve and eventually enable this page, please fork RosettaGit's repository and open a merge request on GitHub.

{{task|Programming environment operations}}[[Category:Networking and Web Interaction]]

Create a class for searching Yahoo! results.

It must implement a '''Next Page''' method, and read URL, Title and Content from results.

AutoHotkey

Translated from the Python example.

; Demo: run the search for "test" twice, asking for result page 1 and then page 2.
test:
yahooSearch("test", 1)
yahooSearch("test", 2)
return

; yahooSearch(query, page)
;   Downloads one page of Yahoo! results for `query` into search.txt, scans the
;   HTML with a regular expression and stores each hit in the numbered variables
;   title1.., content1.., url1.. (one set per match). Each title is also shown
;   in a message box as it is found.
;   query - search text, appended to the URL exactly as given (no escaping here)
;   page  - 1-based result page number
yahooSearch(query, page)
{
  ; assume-global mode: every variable assigned below is a global, which is how
  ; the numbered result variables stay visible after the function returns
  global
  ; value for the "b" URL parameter: page 1 -> 1, page 2 -> 11, page 3 -> 21, ...
  start := ((page - 1) * 10) + 1
  ; remove the previous download before fetching the new page
  filedelete, search.txt
  ; the next two lines are one command (the second is a continuation line)
  urldownloadtofile, % "http://search.yahoo.com/search?p=" . query
  . "&b=" . start, search.txt
  fileread, content, search.txt
  ; capture groups: 1 = link text (title), 2 = abstract, 3 = text of the "url" span
  reg = <a class="yschttl spt" href=".+?" >(.+?)</a></h3></div><div class="abstr">(.+?)</div><span class=url>(.+?)</span>

  ; NOTE(review): index is assigned here but not read anywhere in this function
  index := found := 1
  ; each pass resumes one character after the previous match position;
  ; the captured groups arrive in self1, self2, self3
  while (found := regexmatch(content, reg, self, found + 1))
  {
    ; A_Index is the 1-based loop counter, so hit N lands in titleN/contentN/urlN
    msgbox % title%A_Index% := fix(self1)
    content%A_Index% := fix(self2)
    url%A_Index% := fix(self3)
  }
}

; fix(url)
;   Cleans one captured fragment (used for titles, abstracts and URLs alike):
;   cuts it off at the first "</a></h3></div>" if that marker is present, then
;   removes everything that looks like an HTML tag. Returns the cleaned text.
fix(url)
{
; when the marker is found, keep only the text in front of it
if pos := instr(url, "</a></h3></div>")
StringLeft, url, url, pos - 1
; strip any remaining <...> tags (non-greedy, so each tag is removed separately)
url := regexreplace(url, "<.*?>")
return url
}

C#

Generally, it is not a good idea to scrape web pages. For example, every implementation of this task that uses a regular expression to look for "<a class=" has stopped working since Yahoo! changed its output format.

using System;
using System.Net;
using System.Text.RegularExpressions;
using System.Collections.Generic;

/// <summary>
/// Downloads one page of Yahoo! web-search results for a query and extracts
/// the URL, title and abstract of every hit found in the returned HTML.
/// </summary>
/// <remarks>
/// The HTML is scraped with a regular expression, so the results depend on
/// Yahoo!'s page layout matching the pattern below.
/// </remarks>
class YahooSearch {
    private string query;
    private string content;
    private int page;

    const string yahoo = "http://search.yahoo.com/search?";

    // Matches one result list item. Whitespace inside the pattern is ignored
    // (IgnorePatternWhitespace) and only the named groups capture
    // (ExplicitCapture). Built once instead of on every Results call.
    private static readonly Regex resultPattern = new Regex(@"
        <li>
            <div \s class=""res"">
                <div>
                    <h3>
                        <a \s (?'LinkAttributes'[^>]+)>
                            (?'LinkText' .*?)
                        (?></a>)
                    </h3>
                </div>
                <div \s class=""abstr"">
                    (?'Abstract' .*?)
                (?></div>)
                .*?
            (?></div>)
        </li>",
        RegexOptions.IgnorePatternWhitespace
        | RegexOptions.ExplicitCapture
    );

    // Matches any HTML tag so it can be removed from captured text.
    private static readonly Regex tagPattern = new Regex("<[^>]*>");

    /// <summary>Searches for <paramref name="query"/> and loads the first page (page 0).</summary>
    /// <param name="query">Plain search text; it is percent-encoded before being sent.</param>
    public YahooSearch(string query) : this(query, 0) { }

    /// <summary>Searches for <paramref name="query"/> and loads the given page.</summary>
    /// <param name="query">Plain search text; it is percent-encoded before being sent.</param>
    /// <param name="page">Zero-based page number; page N starts at result N * 10 + 1.</param>
    /// <exception cref="WebException">The download failed.</exception>
    public YahooSearch(string query, int page) {
        this.query = query;
        this.page = page;

        // Escape the query so spaces, '&', '#', '+' etc. cannot break or
        // alter the request URL. A null query is sent as an empty one, which
        // is what string.Format produced for null before.
        string address = string.Format(yahoo + "p={0}&b={1}",
            Uri.EscapeDataString(query ?? ""), this.page * 10 + 1);

        // WebClient is IDisposable; release it as soon as the page is read.
        using (WebClient client = new WebClient()) {
            this.content = client.DownloadString(address);
        }
    }

    /// <summary>
    /// The hits parsed from the downloaded page, in document order. The page
    /// is re-parsed on every access; the array is empty when nothing matches.
    /// </summary>
    public YahooResult[] Results {
        get {
            List<YahooResult> results = new List<YahooResult>();

            foreach (Match hit in resultPattern.Matches(this.content)) {
                // The link target is the text between href=" and the next quote.
                string attributes = hit.Groups["LinkAttributes"].Value;
                string rurl = StripTags(
                    SubstringBefore(SubstringAfter(attributes, "href=\""), "\""));
                string rtitle = StripTags(hit.Groups["LinkText"].Value);
                string rcontent = StripTags(hit.Groups["Abstract"].Value);

                results.Add(new YahooResult(rurl, rtitle, rcontent));
            }
            return results.ToArray();
        }
    }

    /// <summary>Downloads the page after this one for the same query.</summary>
    public YahooSearch NextPage() {
        return new YahooSearch(this.query, this.page + 1);
    }

    /// <summary>Downloads an arbitrary (zero-based) page for the same query.</summary>
    public YahooSearch GetPage(int page) {
        return new YahooSearch(this.query, page);
    }

    // Text of str in front of the first occurrence of marker, or "" when
    // marker is absent. Ordinal comparison: this is markup, not prose.
    private static string SubstringBefore(string str, string marker) {
        int index = str.IndexOf(marker, StringComparison.Ordinal);
        return index < 0 ? "" : str.Substring(0, index);
    }

    // Text of str behind the first occurrence of marker, or "" when marker
    // is absent.
    private static string SubstringAfter(string str, string marker) {
        int index = str.IndexOf(marker, StringComparison.Ordinal);
        return index < 0 ? "" : str.Substring(index + marker.Length);
    }

    // Removes every HTML tag from html, leaving only the text between tags.
    private static string StripTags(string html) {
        return tagPattern.Replace(html, "");
    }
}

/// <summary>
/// One search hit: the target address, the link title and the abstract text.
/// </summary>
class YahooResult {
    public string URL { get; set; }
    public string Title { get; set; }
    public string Content { get; set; }

    /// <summary>Creates a hit from its three parts.</summary>
    public YahooResult(string url, string title, string content) {
        URL = url;
        Title = title;
        Content = content;
    }

    /// <summary>
    /// Formats the hit as three labelled lines (title, link, text) preceded
    /// by an empty line.
    /// </summary>
    public override string ToString()
    {
        object[] fields = { Title, URL, Content };
        return string.Format("\nTitle: {0}\nLink:  {1}\nText:  {2}", fields);
    }
}

// Usage:

class Prog {
    /// <summary>
    /// Prints every result of the first two pages (0 and 1) of a search
    /// for "test" to the console.
    /// </summary>
    static void Main() {
        int[] pages = { 0, 1 };
        for (int i = 0; i < pages.Length; i++)
        {
            YahooResult[] hits = new YahooSearch("test", pages[i]).Results;

            for (int j = 0; j < hits.Length; j++)
            {
                Console.WriteLine(hits[j]);
            }
        }
    }
}

D

import std.stdio, std.exception, std.regex, std.algorithm, std.string,
       std.net.curl;

/// One search hit: target address, link title and abstract text.
struct YahooResult {
    string url;
    string title;
    string content;

    /// Three labelled lines (title, link, text) preceded by an empty line.
    string toString() const {
        return format("\nTitle: %s\nLink:  %s\nText:  %s",
                      title, url, content);
    }
}

/// Downloads one page of Yahoo! search results for a query and exposes the
/// hits parsed from the returned HTML.
struct YahooSearch {
    private string query, content;
    private uint page;

    /// Fetches page `page_` (zero-based; page N starts at result N * 10 + 1)
    /// of the results for `query_`. The query is plain text and is
    /// percent-encoded before it is placed in the URL.
    this(in string query_, in uint page_ = 0) {
        import std.uri : encodeComponent;

        this.query = query_;
        this.page = page_;
        // Encode the query so spaces, '&', '#' and similar characters cannot
        // break or alter the request URL.
        this.content = "http://search.yahoo.com/search?p=%s&b=%d"
                       .format(encodeComponent(query), page * 10 + 1)
                       .get.assumeUnique;
    }

    /// Range of YahooResult values, one per result item matched in the
    /// downloaded page. Tags are stripped from all three fields.
    @property results() const {
        // Free-spacing ("x") pattern: whitespace in the literal is ignored.
        immutable re = `<li>
                          <div \s class="res">
                            <div>
                              <h3>
                                <a \s (?P<linkAttributes> [^>]+)>
                                  (?P<linkText> .*?)
                                </a>
                              </h3>
                            </div>
                            <div \s class="abstr">
                              (?P<abstract> .*?)
                            </div>
                            .*?
                          </div>
                        </li>`;

        // Removes every HTML tag, leaving only the text between tags.
        const clean = (string s) => s.replace("<[^>]*>".regex("g"),"");

        return content.match(re.regex("gx")).map!(m => YahooResult(
            // link target: the text between href=" and the next quote
            clean(m.captures["linkAttributes"]
                  .findSplitAfter(`href="`)[1]
                  .findSplitBefore(`"`)[0]),
            clean(m.captures["linkText"]),
            clean(m.captures["abstract"])
        ));
    }

    /// Downloads the page after this one for the same query.
    YahooSearch nextPage() const {
        return YahooSearch(query, page + 1);
    }
}

/// Prints the hits of the first result page for "test", one per line group.
void main() {
    auto firstPage = YahooSearch("test");
    writefln("%(%s\n%)", firstPage.results);
}


Title: Test.com
Link:  http://www.test.com/
Text:  Test.com provides a complete software solution for creating online tests and managing enterprise and specialist certification programs, in up to 22 languages.

Title: Speakeasy Speed Test
Link:  http://www.speakeasy.net/speedtest/
Text:  Test your Internet Connection with Speakeasy&#39;s reliable and accurate broadband speed test. What&#39;s your speed?

Title: Test | Define Test at Dictionary.com
Link:  http://dictionary.reference.com/browse/test
Text:  noun 1. the means by which the presence, quality, or genuineness of anything is determined; a means of trial. 2. the trial of the quality of something: to put to the ...

Gambas

' Runs when the form opens: configures the window and places a single WebView
' on it that loads the Yahoo! home page. No searching or result parsing is
' done in code; the user types the query into the page itself.
Public Sub Form_Open()
' the embedded browser control created below
Dim hWebView As WebView

' window setup: arrangement mode for child controls, start maximized, caption
Me.Arrangement = Arrange.Fill
Me.Maximized = True
Me.Title = "Yahoo! search interface"

' create the browser as a child of this form and point it at Yahoo!
hWebView = New WebView(Me)
hWebView.Expand = True
hWebView.URL = "https://www.yahoo.com"

End

'''[http://www.cogier.com/gambas/Yahoo!%20search%20interface.png Click here to see output (I have typed 'rosettacode' in the search box)]'''

GUISS

Start,Programs,Applications,Mozilla Firefox,Inputbox:address bar>www.yahoo.co.uk,
Button:Go,Area:browser window,Inputbox:searchbox>elephants,Button:Search

Haskell

Haskell is not an object oriented language, so this example does not implement an object class. However, it can be interesting as an example of how HTML source code can be parsed using the Parsec library.

import Network.HTTP
import Text.Parsec

-- | One search hit: target address, link title and abstract text.
data YahooSearchItem = YahooSearchItem
    { itemUrl     :: String
    , itemTitle   :: String
    , itemContent :: String
    }

-- | A fetched result page: the query it belongs to, its page number and
-- the hits parsed from it.
data YahooSearch = YahooSearch
    { searchQuery :: String
    , searchPage  :: Int
    , searchItems :: [YahooSearchItem]
    }

-- Base address of a Yahoo! search; the query is appended directly and the
-- page offset, if any, is added by the caller.
yahooUrl :: String
yahooUrl = "http://search.yahoo.com/search?p="

-- Fetch the first result page for a query and parse it into a YahooSearch
-- whose page number is 1. The query is appended to the URL as given.
yahoo :: String -> IO YahooSearch
yahoo q = do
    response <- simpleHTTP (getRequest (yahooUrl ++ q))
    body <- getResponseBody response
    return (YahooSearch q 1 (items body))

-- Fetch the result page that follows the given one and return it as a new
-- YahooSearch (same query, page number increased by one).
--
-- Yahoo!'s "b" parameter is the 1-based index of the first result to show,
-- not a page number, and a page holds ten results. The page after page p
-- therefore starts at result p * 10 + 1 (page 2 -> 11, page 3 -> 21, ...).
next :: YahooSearch -> IO YahooSearch
next (YahooSearch q p _) = do
    let nextPage    = p + 1
        firstResult = p * 10 + 1
    response <- simpleHTTP (getRequest $
        yahooUrl ++ q ++ "&b=" ++ show firstResult)
    body <- getResponseBody response
    return (YahooSearch q nextPage (items body))

-- Print a header with the query and page number, then every hit as three
-- labelled lines followed by an empty line.
printResults :: YahooSearch -> IO ()
printResults (YahooSearch q p found) = do
    putStrLn ("Showing Yahoo! search results for query: " ++ q)
    putStrLn ("Page: " ++ show p)
    putChar '\n'
    mapM_ showItem found
  where
    showItem item = do
        putStrLn ("URL   : " ++ itemUrl item)
        putStrLn ("Title : " ++ itemTitle item)
        putStrLn ("Abstr : " ++ itemContent item)
        putChar '\n'

urlTag, titleTag, contentTag1, contentTag2, ignoreTag,
    ignoreText :: Parsec String () String

-- The opening <a> tag of a result link; yields the value of its href
-- attribute and consumes the rest of the tag up to and including '>'.
urlTag = do
    _ <- string "<a id=\"link-"
    _ <- many digit
    _ <- string "\" class=\"yschttl spt\" href=\""
    link <- manyTill anyChar (char '"')
    _ <- manyTill anyChar (char '>')
    return link

-- The title is the text right behind the URL tag: parse that tag, drop its
-- result, and collect everything up to the closing </a>.
titleTag = urlTag >> manyTill anyChar (try (string "</a>"))

-- The description of a result lives in a <div> whose class is either
-- "sm-abs" (contentTag1) or "abstr" (contentTag2); yield its contents.
contentTag1 =
    string "<div class=\"sm-abs\">" >> manyTill anyChar (try (string "</div>"))

contentTag2 =
    string "<div class=\"abstr\">" >> manyTill anyChar (try (string "</div>"))

-- Any other tag: consume it and yield the empty string.
ignoreTag = char '<' >> manyTill anyChar (char '>') >> return ""

-- A run of text outside tags: consume it and yield the empty string.
ignoreText = many1 (noneOf "<") >> return ""

-- Drop the empty strings (produced by the ignore* parsers) from a result list.
nonempty :: [String] -> Parsec String () [String]
nonempty xs = return (filter (not . null) xs)

-- Template for scanning a whole source text for one kind of item: apply
-- the given parser repeatedly, require that the input is fully consumed,
-- and keep only the non-empty results.
parseCategory x = many x >>= \found -> eof >> nonempty found

urls, titles, contents :: Parsec String () [String]

-- All result URLs in the HTML source; anything else is skipped.
urls = parseCategory (try urlTag <|> ignoreTag <|> ignoreText)

-- All result titles in the HTML source; anything else is skipped.
titles = parseCategory (try titleTag <|> ignoreTag <|> ignoreText)

-- All result descriptions in the HTML source (either div class);
-- anything else is skipped.
contents = parseCategory
    (try contentTag1 <|> try contentTag2 <|> ignoreTag <|> ignoreText)

-- Scan the HTML source three times (URLs, titles, descriptions) and
-- combine the three lists position by position into YahooSearchItem
-- values. A scan that fails to parse contributes an empty list, and the
-- result is as long as the shortest of the three lists.
items :: String -> [YahooSearchItem]
items q = zipWith3 YahooSearchItem (scan urls) (scan titles) (scan contents)
  where
    scan parser = either (const []) id (parse parser "" q)

Simple invocation from GHCi:

yahoo "Rosetta%20code" >>= printResults

Notice that spaces must be expressed as "%20", because spaces are not allowed in URLs.

Icon and Unicon

The following uses the Unicon pre-processor and messaging extensions and won't run under Icon without significant modification. The code provides a suitable demonstration; however, it could be made more robust by, for example, URL-escaping the search string.

link printf,strings

# Demo: create a search for "rosettacode", then fetch and print the first
# two result pages one after the other.
procedure main()
YS := YahooSearch("rosettacode")
every 1 to 2 do {   # 2 pages
   YS.readnext()        # advance to the next page and download it
   YS.showinfo()        # print title, URL and abstract of each hit found
   }
end

# Fields:
#   urlpat   - URL pattern for the search; still contains a %d for the offset
#   page     - number of the page fetched last (0 before the first readnext)
#   response - raw text of the page fetched last
class YahooSearch(urlpat,page,response)  #: class for Yahoo Search

   method readnext()    #: read the next page of search results
      self.page +:= 1   # can't find as w|w/o self
      readurl()
   end

   method readurl()     #: read the url
      # fill in the offset of the first result: page 1 -> 1, page 2 -> 11, ...
      url := sprintf(self.urlpat,(self.page-1)*10+1)
      # mode "m" opens the URL through the messaging extensions
      m := open(url,"m")  | stop("Unable to open : ",url)
      # start from an empty string and append every chunk read(m) produces
      # until read fails
      every (self.response := "") ||:= |read(m)
      close(m)
      self.response := deletec(self.response,"\x00") # kill stray NULs
   end

   method showinfo()    #: show the info of interest
      # scan the response; each pass of the loop handles one hit and the loop
      # ends as soon as the result-link pattern can no longer be matched
      self.response ? repeat {
         (tab(find("<")) & ="<a class=\"yschttl spt\" href=\"") | break
         # URL: text up to the closing quote, then skip to the end of the tag
         url   := tab(find("\"")) & tab(find(">")+1)
         # title: text up to the next tag, which must close the link/heading
         title := tab(find("<")) & ="</a></h3></div>"
         # the abstract is in a div of class "abstr" or "sm-abs"
         tab(find("<")) & =("<div class=\"abstr\">" | "<div class=\"sm-abs\">")
         abstr := tab(find("<")) & ="</div>"

         printf("\nTitle : %i\n",title)
         printf("URL   : %i\n",url)
         printf("Abstr : %i\n",abstr)
         }
   end

# searchtext is inserted into the URL as given (no escaping); the doubled %%
# leaves a literal %d in urlpat for readurl to fill in later
initially(searchtext)    #: initialize each instance
   urlpat := sprintf("http://search.yahoo.com/search?p=%s&b=%%d",searchtext)
   page := 0
end

{{libheader|Icon Programming Library}} [http://www.cs.arizona.edu/icon/library/src/procs/printf.icn printf.icn provides formatting] [http://www.cs.arizona.edu/icon/library/src/procs/strings.icn strings.icn provides deletec]

Sample Output (truncated):


Title : "<b>Rosetta Code</b> - <b>Rosetta Code</b>"
URL   : "http://rosettacode.org/"
Abstr : "<b>Rosetta Code</b> is a programming chrestomathy site. The idea is to
present solutions to the same task in as many different languages as possible, t
o demonstrate how ..."

Title : "<b>Rosetta Code</b> - Wikipedia, the free <wbr />encyclopedia"
URL   : "http://en.wikipedia.org/wiki/Rosetta_Code"
Abstr : " <b>Rosetta Code</b> is a wiki -based programming chrestomathy website
with solutions to various programming problems in many different programming lan
guages. It was created ..."

Title : "Category:AutoHotkey - <b>Rosetta Code</b>"
URL   : "http://rosettacode.org/wiki/Category:AutoHotkey"
Abstr : "Listed below are all of the tasks on <b>Rosetta Code</b> which have bee
n solved using AutoHotkey."

...


Title : "RosettaCON2011 Tutorials Collection | <wbr />RosettaCommons"
URL   : "http://www.rosettacommons.org/"
Abstr : "Foldit in the news. Cooper et al. 2010 Predicting protein structures wi
th a multiplayer online game, Nature 466 , 756 see also video. Rosetta-3.3 is no
w available!"

Title : "CALL: call a SUBROUTINE - HicEst: <wbr />Windows IDE programming ..."
URL   : "http://www.hicest.com/CALL.htm"
Abstr : "\xe2\x87\x92 Example of a CALL call in &quot;Roman_numerals&quot; (<b>R
osettaCode</b>) CALL transfers control to the first statement of a SUBROUTINE. C
ALL subroutine_name[argument1, ..."

Java

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Pages through Yahoo! web-search results for a fixed query.
 * <p>
 * Each call to {@link #search()}, {@link #search(int)}, {@link #nextPage()} or
 * {@link #previousPage()} downloads one result page and scrapes URL, title and
 * abstract of every hit with a regular expression. Pages are numbered from 1.
 */
class YahooSearch {
    private String query;
    // Page number
    private int page = 1;
    // Regexp to look for the individual results in the returned page
    private static final Pattern pattern = Pattern.compile(
        "<a class=\"yschttl spt\" href=\"[^*]+?\\*\\*([^\"]+?)\">(.+?)</a></h3>.*?<div class=\"(?:sm-abs|abstr)\">(.+?)</div>");

    /**
     * @param query plain search text; it is URL-encoded when the request is built
     */
    public YahooSearch(String query) {
        this.query = query;
    }

    /**
     * Downloads and parses the current page.
     *
     * @return the hits found on the page; empty when the download failed
     *         (the failure is printed to standard error) or nothing matched
     * @throws MalformedURLException if the request URL or a result URL is invalid
     * @throws IOException if the query or a result URL cannot be encoded/decoded
     */
    public List<YahooResult> search() throws MalformedURLException, URISyntaxException, IOException {
        // Build the search string, starting with the Yahoo search URL,
        // then appending the query and optionally the page number (if > 1)
        StringBuilder searchUrl = new StringBuilder("http://search.yahoo.com/search?");
        searchUrl.append("p=").append(URLEncoder.encode(query, "UTF-8"));
        if (page > 1) {
            // "b" is the 1-based index of the first result on the page
            searchUrl.append("&b=").append((page - 1) * 10 + 1);
        }
        String searchResult = download(new URL(searchUrl.toString()));
        return parse(searchResult);
    }

    /** Jumps to the given page (1-based) and searches. */
    public List<YahooResult> search(int page) throws MalformedURLException, URISyntaxException, IOException {
        // Set the page number and search
        this.page = page;
        return search();
    }

    /** Moves one page forward and searches. */
    public List<YahooResult> nextPage() throws MalformedURLException, URISyntaxException, IOException {
        // Increment the page number and search
        page++;
        return search();
    }

    /** Moves one page back and searches; on page 1 an empty list is returned instead. */
    public List<YahooResult> previousPage() throws MalformedURLException, URISyntaxException, IOException {
        // Decrement the page number and search; if the page number is 1 return an empty list
        if (page > 1) {
            page--;
            return search();
        } else {
            return new ArrayList<YahooResult>();
        }
    }

    // Reads the whole response as one string with the line breaks removed, so
    // the pattern (whose '.' does not match line breaks) can span source lines.
    // A failed download is reported on standard error and yields whatever was
    // read so far.
    private static String download(URL url) {
        StringBuilder sb = new StringBuilder();
        BufferedReader in = null;
        try {
            // Decode explicitly as UTF-8 rather than with the platform default
            // charset, which differs between machines.
            // NOTE(review): assumes the page is served as UTF-8, matching the
            // encoding already used for the query and the result URLs.
            in = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            // The reader is still null when opening the stream failed.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
        return sb.toString();
    }

    // Extracts every hit matched by the pattern from the page source.
    private static List<YahooResult> parse(String searchResult) throws IOException {
        List<YahooResult> result = new ArrayList<YahooResult>();
        Matcher matcher = pattern.matcher(searchResult);
        while (matcher.find()) {
            // Group 1 is the percent-encoded target URL, 2 the title, 3 the excerpt
            String resultUrl = URLDecoder.decode(matcher.group(1), "UTF-8");
            String resultTitle = stripHighlighting(matcher.group(2));
            String resultContent = stripHighlighting(matcher.group(3));
            result.add(new YahooResult(resultUrl, resultTitle, resultContent));
        }
        return result;
    }

    // Removes the <b> and <wbr> tags Yahoo! inserts into titles and excerpts.
    private static String stripHighlighting(String html) {
        return html.replaceAll("</?b>", "").replaceAll("<wbr ?/?>", "");
    }
}

class YahooResult {
    private URL url;
    private String title;
    private String content;

    public URL getUrl() {
        return url;
    }

    public void setUrl(URL url) {
        this.url = url;
    }

    public void setUrl(String url) throws MalformedURLException {
        this.url = new URL(url);
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public YahooResult(URL url, String title, String content) {
        setUrl(url);
        setTitle(title);
        setContent(content);
    }

    public YahooResult(String url, String title, String content) throws MalformedURLException {
        setUrl(url);
        setTitle(title);
        setContent(content);
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        if (title != null) {
            sb.append(",title=").append(title);
        }
        if (url != null) {
            sb.append(",url=").append(url);
        }
        return sb.charAt(0) == ',' ? sb.substring(1) : sb.toString();
    }
}

/** Demo: prints every hit of the first result page for "Rosetta code". */
public class TestYahooSearch {
    public static void main(String[] args) throws MalformedURLException, URISyntaxException, IOException {
        // Run the search for the first page
        List<YahooResult> hits = new YahooSearch("Rosetta code").search();
        // Print one line per hit
        for (int i = 0; i < hits.size(); i++) {
            System.out.println(hits.get(i).toString());
        }
    }
}

Kotlin

{{incorrect|Kotlin}} This is based on the C# entry, but uses a regular expression that reflects what appears to be the Yahoo! output format as of the date of this entry (4 December 2017).

// version 1.2.0

import java.net.URL

// Pattern for one result block: group 1 is the link's href, group 2 the link
// text (title), group 3 the text of the "abstract ellipsis" paragraph.
val rx = Regex("""<div class=\"yst result\">.+?<a href=\"(.*?)\" class=\"\">(.*?)</a>.+?class="abstract ellipsis">(.*?)</p>""")

/** One search hit: link title, target address and abstract text. */
class YahooResult(var title: String, var link: String, var text: String) {

    /** Three labelled lines (title, link, text) preceded by an empty line. */
    override fun toString(): String {
        return "\nTitle: $title\nLink : $link\nText : $text"
    }
}

/**
 * Downloads one page of Yahoo! search results when constructed and exposes
 * the hits scraped from it.
 *
 * @property query plain search text; it is URL-encoded before being sent
 * @property page zero-based page number; page N starts at result N * 10 + 1
 */
class YahooSearch(val query: String, val page: Int = 0) {

    private val content: String

    init {
        val yahoo = "http://search.yahoo.com/search?"
        // Encode the query so spaces, '&', '#' and similar characters cannot
        // break or alter the request URL.
        val encodedQuery = java.net.URLEncoder.encode(query, "UTF-8")
        val url = URL("${yahoo}p=$encodedQuery&b=${page * 10 + 1}")
        content = url.readText()
    }

    /**
     * The hits matched in the downloaded page, in document order. The page is
     * re-parsed and a fresh list is built on every access.
     */
    val results: MutableList<YahooResult>
        get() {
            val list = mutableListOf<YahooResult>()
            for (mr in rx.findAll(content)) {
                val link = mr.groups[1]!!.value
                val title = stripBold(mr.groups[2]!!.value)
                val text = stripBold(mr.groups[3]!!.value)
                list.add(YahooResult(title, link, text))
            }
            return list
        }

    /** Downloads the page after this one for the same query. */
    fun nextPage() = YahooSearch(query, page + 1)

    /** Downloads an arbitrary (zero-based) page for the same query. */
    fun getPage(newPage: Int) = YahooSearch(query, newPage)

    // Removes the <b>...</b> highlighting Yahoo! puts around query terms.
    private fun stripBold(html: String) = html.replace("<b>", "").replace("</b>", "")
}

/** Demo: prints the first three hits of the first two pages for "rosettacode". */
fun main(args: Array<String>) {
    (0..1).forEach { page ->
        val search = YahooSearch("rosettacode", page)
        println("\nPAGE ${page + 1} =>")
        search.results.take(3).forEach { hit -> println(hit) }
    }
}

Output (restricted to first three results on first two pages):

PAGE 1 =>

Title: Rosetta Code - Official Site
Link : http://rosettacode.org/wiki/Rosetta_Code
Text : Rosetta Code is a programming chrestomathy site. The idea is to present solutions to the same task in as ...

Title: Rosetta Code - Wikipedia
Link : https://en.wikipedia.org/wiki/Rosetta_Code
Text : Rosetta Code is a wiki-based programming chrestomathy website with implementations of common algorithms ...

Title: Rosetta Code (@rosettacode) | Twitter
Link : https://twitter.com/rosettacode
Text : The latest Tweets from Rosetta Code (@rosettacode). Twitter account for http://t.co/DuRZFWDfRn. The ...

PAGE 2 =>

Title: Rosetta Code Blog
Link : http://blog.rosettacode.org/
Text : As I noted, there was an expectation of downtime as the VPS hostRosetta Code sits on moved from one data ...

Title: Rosetta Code - Wikipedia
Link : https://en.wikipedia.org/wiki/User:Paddy3118/Rosetta_Code
Text : Rosetta Code is a wiki-based programming chrestomathy website with implementations of common algorithms ...

Title: Rosetta Code and ABAP | SAP Blogs
Link : https://blogs.sap.com/2015/03/27/rosetta-code-and-abap/
Text : Last week Christian Drumm (@ceedee666) and Fred Verheul (@fredverheul) had a short conversation on ...

Mathematica

We cannot define a class in Mathematica, so I generate a "Manipulate" object instead. Manipulate[ Column[Flatten[ StringCases[ StringCases[ URLFetch[ "http://search.yahoo.com/search?p=" <> query <> "&b=" <> ToString@page], "<ol" ~~ ___ ~~ ""], "<a" ~~ Shortest[] ~~ "class="yschttl spt" href="" ~~ Shortest[url] ~~ """ ~~ Shortest[] ~~ ">" ~~ Shortest[title] ~~ "<div class="abstr">" | "<div class="sm-abs">" ~~ Shortest[abstr__] ~~ "

AutoHotkey

C#

D

Gambas

GUISS

Haskell

Java

Kotlin

Mathematica

Perl

PicoLisp

Python

R

Racket

REPL:

Ruby

Run BASIC

Tcl

TXR