# Infoseek.pm
# Copyright (C) 1998 by Martin Thurn
# $Id: Infoseek.pm,v 1.26 1999/12/10 14:26:13 mthurn Exp $

=head1 NAME

WWW::Search::Infoseek - class for searching Infoseek

=head1 SYNOPSIS

  use WWW::Search;
  my $oSearch = new WWW::Search('Infoseek');
  my $sQuery = WWW::Search::escape_query("+sushi restaurant +Columbus Ohio");
  $oSearch->native_query($sQuery);
  while (my $oResult = $oSearch->next_result())
    { print $oResult->url, "\n"; }

=head1 DESCRIPTION

This class is an Infoseek specialization of WWW::Search.
It handles making and interpreting Infoseek searches
F<http://www.infoseek.com>.

This class exports no public interface; all interaction should
be done through L<WWW::Search> objects.

=head1 SEE ALSO

L<WWW::Search::Infoseek::Companies>
L<WWW::Search::Infoseek::Web>
L<WWW::Search::Infoseek::News>

To make new back-ends, see L<WWW::Search>.

=head1 BUGS

Please tell the author if you find any!

=head1 TESTING

This module adheres to the C<WWW::Search> test suite mechanism.
See C<WWW::Search::Infoseek::Web> for test cases for the default usage.

=head1 AUTHOR

C<WWW::Search::Infoseek> is maintained by Martin Thurn
(MartinThurn@iname.com).

=head1 LEGALESE

THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.

=head1 VERSION HISTORY

If it is not listed here, then it was not a meaningful nor released revision.

=head2 2.06, 1999-12-10

handle infoseek.com's slight output format change

=head2 2.05, 1999-10-05

BUGFIX: parser for ::Companies and ::News;
now uses hash_to_cgi_string()

=head2 2.04, 1999-09-29

BUGFIX: handle descriptions with embedded \n;
ignore "company profile" URLs during Web search

=head2 2.02, 1999-09-28

BUGFIX: was going to the "previous" page instead of the "next" page!

=head2 2.01, 1999-07-13

=head2 1.18, 1999-07-09

BUGFIX for not seeing all the URLs on a page, and fetching the first
page of results over and over(!).

=head2 1.16, 1999-06-30

Now strips HTML tags from titles and descriptions.

=head2 1.14, 1999-06-29

Companies and News searches now work!

=head2 1.13, 1999-06-28

www.infoseek.com changed their output format ever so slightly.
Companies and News searches return URLs, but titles and descriptions are unreliable.

=head2 1.12, 1999-05-21

www.infoseek.com changed their output format.

=head2 1.11, 1999-04-27

Fixed BUG in parsing of News search results.
Added version method.

=head2 1.08, 1999-02-09

Fixed BUG in parsing of Companies search results.
Thanks to Jim Smyser (jsmyser@bigfoot.com) for pointing it out.

=head2 1.7, 1998-10-05

www.infoseek.com changed their output format.
Thanks to Andreas Borchert (borchert@mathematik.uni-ulm.de) for patches.

=head2 1.6, 1998-09-18

Fixed BUG where (apparently) no titles were retrieved.

=head2 1.5

www.infoseek.com changed their output format ever-so-slightly.

=head2 1.3

First publicly-released version.

=cut

#####################################################################

package WWW::Search::Infoseek;

require Exporter;
@EXPORT = qw();
@EXPORT_OK = qw();
@ISA = qw(WWW::Search Exporter);
# Kept in step with the newest entry of the VERSION HISTORY section (2.06,
# 1999-12-10); the variable previously still said 2.05.
$VERSION = '2.06';

use Carp ();
use WWW::Search(qw( generic_option strip_tags ));
require WWW::SearchResult;
use URI;

# NOTE(review): the copy of this file that was reviewed appears to have had
# every "<...>" sequence stripped out of it (HTML-tag removal).  Regex
# literals and statements that lost text this way are reproduced exactly as
# received and are flagged with "DAMAGED" below.  They must be restored from
# an intact copy of the module before this file will compile; nothing has
# been guessed at here.

# private
#
# Convert a size string such as "4.3K", "2M" or "512" into a number of
# bytes (K = 1024, M = 1024*1024).  Returns undef when the text is not a
# plain decimal number with an optional single K or M suffix.
#
# This replaces a string eval of text captured from the search engine's
# response, which would have executed arbitrary Perl sent by the server.
sub _size_in_bytes {
    my ($sSize) = @_;
    return undef unless defined $sSize;
    return undef unless $sSize =~ m/\A(\d+(?:\.\d*)?|\.\d+)([KM]?)\z/;
    my ($number, $unit) = ($1, $2);
    return $number * 1024 * 1024 if $unit eq 'M';
    return $number * 1024        if $unit eq 'K';
    return $number + 0;
} # _size_in_bytes

# private
#
# Prepare the object for the first page of a search.
#
# Arguments:
#   $self         - the search object
#   $native_query - the query string (expected to be URL-escaped already)
#   $rhOptions    - optional hashref of CGI options; each key overrides the
#                   corresponding entry of $self->{_options}
#
# Effects: sets _hits_per_page, agent_e_mail, _next_to_retrieve, _num_hits,
# _options (defaults are only installed if _options is not yet defined),
# _next_url and _debug on $self.  The return value is not meaningful.
sub native_setup_search {
    my ($self, $native_query, $rhOptions) = @_;

    # WARNING: www.infoseek.com returns 25 hits per page no matter what number
    # you send in the argument list!
    my $DEFAULT_HITS_PER_PAGE = 25;
    # $DEFAULT_HITS_PER_PAGE = 10;  # for debugging
    $self->{'_hits_per_page'} = $DEFAULT_HITS_PER_PAGE;

    $self->{agent_e_mail} = 'MartinThurn@iname.com';
    # www.Infoseek.com doesn't like robots: response from server was 403
    # (Forbidden) Forbidden by robots.txt
    $self->user_agent(1);

    $self->{'_next_to_retrieve'} = 0;
    $self->{'_num_hits'} = 0;

    # Remove '*' at end of query terms within the user's query.  If the
    # query string is not escaped (even though it's supposed to be),
    # change '* ' to ' ' at end of words and at the end of the string.
    # If the query string is escaped, change '%2A+' to '+' at end of
    # words and delete '%2A' at the end of the string.
    # (\052 is '*', \040 is ' ', \045 is '%', \053 is '+'.)
    $native_query =~ s/(\w)\052\s/$1\040/g;
    $native_query =~ s/(\w)\052$/$1\040/g;
    $native_query =~ s/(\w)\0452A\053/$1\053/g;
    $native_query =~ s/(\w)\0452A$/$1/g;

    if (!defined($self->{_options})) {
        # These are the defaults:
        $self->{_options} = {
            'search_url' => 'http://www.infoseek.com/Titles',
            'qt' => $native_query,
            'st' => $self->{'_next_to_retrieve'},
            'nh' => $self->{'_hits_per_page'},
            'rf' => '0',
            'col' => 'WW',
        };
    } # if

    # Copy in options passed in the argument list:
    if (defined($rhOptions)) {
        foreach (keys %$rhOptions) {
            $self->{'_options'}->{$_} = $rhOptions->{$_};
        } # foreach
    } # if

    # Copy in options which were set by a child object
    # (these win over the argument list because they are applied last):
    if (defined($self->{'_child_options'})) {
        foreach (keys %{$self->{'_child_options'}}) {
            $self->{'_options'}->{$_} = $self->{'_child_options'}->{$_};
        } # foreach
    } # if

    # Finally figure out the url.
    $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options});

    # Set some private variables:
    # _debug is 2 when search_parse_debug is set, otherwise the value of
    # search_debug, otherwise 0.
    $self->{_debug} = $self->{'_options'}->{'search_debug'};
    $self->{_debug} = 2 if ($self->{'_options'}->{'search_parse_debug'});
    $self->{_debug} = 0 if (!defined($self->{_debug}));
} # native_setup_search


# private
#
# Fetch the page at $self->{_next_url} and parse it line by line with a
# small state machine ($state), creating one WWW::SearchResult per hit and
# pushing finished hits onto @{$self->{cache}}.
#
# Returns undef if there is no next URL or the HTTP request failed;
# otherwise the number of hits found on this page.  $self->{_next_url} is
# cleared before parsing and set again only if a "Next" link is recognised.
sub native_retrieve_some {
    my ($self) = @_;

    # Fast exit if already done:
    return undef unless defined($self->{_next_url});

    # A macro for HTML whitespace.  Note that it contains a capturing group,
    # so any capture written after $SPACE in a pattern is numbered one higher.
    # NOTE(review): the first alternative is presumably a decoded "&nbsp;"
    # entity -- confirm against an intact copy.
    my $SPACE = '( |\s)+';

    # If this is not the first page of results, sleep so as to not overload the server:
    $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'};

    # Get some results, adhering to the WWW::Search mechanism:
    print STDERR " * sending request (",$self->{_next_url},")\n" if $self->{'_debug'};
    my($response) = $self->http_request('GET', $self->{_next_url});
    $self->{response} = $response;
    if (!$response->is_success) {
        return undef;
    };
    print STDERR " * got response\n" if $self->{'_debug'};
    $self->{'_next_url'} = undef;

    # Parse the output.  Each variable holds a two-letter tag naming a
    # parser state.
    my ($START, $HEADER, $HITS, $DESC,$PERCENT,$SIZE,$DATE, $NEXT,$COMP_NEXT, $TRAILER, $WEB_HITS,
        $WEB_NEXT, $DESC_SPLIT) = qw( ST HE HI DE PE SI DA NE CN TR WH WN DS );
    my $hits_found = 0;
    my $state = $START;
    my $hit;
    my $sContent = $response->content();
    # DAMAGED: the pattern (and possibly part of the replacement) lost a tag.
    $sContent =~ s/\/\n/g;
    my $sPrevLine = '';
    foreach ($self->split_lines($sContent)) {
        next if m/^$/; # short circuit for blank lines

        if ($state eq $DESC_SPLIT) {
            # On the previous line we noticed that the description contained
            # a \n.  Prepend that previous line onto this line and continue
            # as normal.
            $_ = $sPrevLine .' '. $_;
            $state = $WEB_HITS;
            $sPrevLine = '';
        } # if

        print STDERR " * $state ===$_===" if 2 <= $self->{'_debug'};

        # DAMAGED: second pattern lost the tags around the captured count.
        if ($state eq $START &&
            m=web\ssearch\sresults=i &&
            m=of\s+\([\d,]+)\\s+results=i) {
            # Actual line of input is:
            #  Web search results    1 - 10 of 99 results most relevant to martin thurn
            print STDERR "web header line\n" if 2 <= $self->{'_debug'};
            my $iCount = $1;
            $iCount =~ s/,//g;
            $self->approximate_result_count($iCount);
            $state = $NEXT;
            next;
        } # we're in START mode, and line has number of WEB results

        if ($state eq $START && m=\>Search\sresults<=i) {
            # Actual line of input is:
            #  Search results
            $state = $NEXT;
        }
        # DAMAGED: pattern lost the tag in front of the captured count.
        elsif ($state eq $START &&
               m=\>\d+\s+-\s+\d+\s+of\s+\([0-9,]+)=) {
            # Actual line of input is:
            # ARTICLES 1 - 25 of 1,239 total articles
            #
            #  Web search results    1 - 25 of 97 results most relevant to Martin Thurn
            print STDERR "header line\n" if 2 <= $self->{'_debug'};
            my $iCount = $1;
            # Delete everything that is not a digit (e.g. the comma in
            # "1,239").  The former tr/[^0-9]// deleted nothing, and $1 was
            # passed on instead of the cleaned-up copy.
            $iCount =~ tr/0-9//cd;
            $self->approximate_result_count($iCount);
            $state = $HEADER;
            next;
        } # we're in START mode, and line has number of results

        # if ($state eq $HEADER &&
        #     m@roup\sthese\sresults@)
        #   {
        #   # Actual line of input is:
        #   # Ungroup these results
        #   print STDERR "group/ungroup line\n" if 2 <= $self->{'_debug'};
        #   $state = $NEXT;
        #   next;
        #   } # we're in HEADER mode, and line talks about (un)grouping results

        # if ($state eq $HEADER &&
        #     m@>Hide\ssummaries<@i)
        #   {
        #   # Actual line of input is:
        #   # Hide summaries
        #   print STDERR "show/hide summaries line\n" if 2 <= $self->{'_debug'};
        #   $state = $COMP_NEXT;
        #   next;
        #   } # we're in HEADER mode, and line talks about hide summaries

        # DAMAGED: as received this pattern only matches an empty line, and
        # empty lines are skipped at the top of the loop; it presumably
        # contained a tag.
        if ($state eq $HEADER && m/^$/i) {
            print STDERR " End RHS line\n" if 2 <= $self->{'_debug'};
            $state = $NEXT;
            next;
        }

        # DAMAGED: pattern lost the opening of the anchor tag.
        if ((($state eq $NEXT) || ($state eq $WEB_NEXT)) &&
            s@\]*?)\"\> < Previous$SPACE\d+@WWWSEARCHDELETED@i) {
            print STDERR " deleted 'previous' link\n" if 2 <= $self->{'_debug'};
            # Stay on this line of input!
        }

        # DAMAGED: pattern lost the opening of the anchor tag.
        if ((($state eq $NEXT) || ($state eq $WEB_NEXT)) &&
            s@\]*?)\"\>Next$SPACE\d+@WWWSEARCHDELETED@i) {
            # Actual line of input is:
            # Next 10 >  |  ...
            # < Previous 25  | Next 25 >  |  Hide summaries  |   Sort by date  |  Group results
            #
            # Countdown to Star Wars: Episode One: The Phantom Menace
            # Fan site features the very latest 'Phantom Menace' news and a large multimedia archive. Also, live web-cams of Star Wars fans waiting in line.
            # 74%  Date: 30 Jun 1999,  Size 4.3K,  http://starwars.countingdown.com/
            # Find similar pages  |  Translate this page
            print STDERR " found 'next' link\n" if 2 <= $self->{'_debug'};
            # There is a "next" link on this page, therefore there are
            # indeed more results for us to go after next time.
            $self->{_next_url} = $1;
            $state = $WEB_HITS;
            # Stay on this line of input!
        }
        # DAMAGED: first pattern lost the opening of the anchor tag.
        elsif ($state eq $NEXT &&
               (s@^.*?\]*?)\"\>Group\sresults@WWWSEARCHDELETED@i ||
                m!\">Hide\ssummaries!i)) {
            print STDERR " no 'next' link\n" if 2 <= $self->{'_debug'};
            $self->{_next_url} = undef;
            $state = $WEB_HITS;
            # Stay on this line of input!
        }

        # DAMAGED: pattern lost the anchor tags around URL and title.
        if ($state eq $WEB_HITS && m!\\]*?)\"\>(.*?)\!i) {
            # Sample line of input:
            # ...
            #
            # American Home Mortgage Holdings, Inc.:  company profile
            # http://www.mortgageselect.com
            # 1. WWW::Search
            # WWW::Search is a collection of Perl modules which provide an API to WWW search engines like AltaVista, Lycos, Hotbot, WebCrawler, and so on. Currently WWW::Search includes back-ends for variations of AltaVista, Lycos, ...
            # Relevance: 65%  Date: 26 Jul 1999,  Size 13.5K,  http://www.isi.edu/lsam/tools/WWW_SEARCH/
            # Find similar pages  |  Translate this page
            my ($sURL,$sTitle) = ($1,$2);
            if ($sURL =~ m/infoseek\.go\.com/ && $sTitle =~ m/(Next|Previous)\s\d+/) {
                print STDERR " ignoring '$1 page' link\n" if 2 <= $self->{'_debug'};
                next;
            } # if
            # DAMAGED: the end of the pattern, the opening brace and the
            # start of a debug print statement were swallowed here.
            if (m!>company profile{'_debug'};
                next;
            } # if
            print STDERR " webhit URL line\n" if 2 <= $self->{'_debug'};
            # DAMAGED: the pattern lost the tag expected at end of line.
            if (($self->{_options}->{'col'} eq 'WW') && (! m!$!i)) {
                print STDERR " SPLIT DESCRIPTION!!!\n" if 2 <= $self->{'_debug'};
                # There is a \n in the middle of the description.  We need to
                # append the next line onto this line and try again...
                $sPrevLine = $_;
                chomp $sPrevLine;
                $state = $DESC_SPLIT;
                next;
            } # if
            if (defined($hit)) {
                push(@{$self->{cache}}, $hit);
                $self->{'_num_hits'}++;
            } # if
            $hits_found++;
            $hit = WWW::SearchResult->new;
            # Resolve relative links against the search URL.
            my $sURLabs = URI->new_abs($sURL, $self->{_options}{search_url});
            $hit->add_url($sURLabs);
            $hit->title(strip_tags($sTitle));
            $state = $DESC;
            $hit->score($1) if (m/(\d+)\%$SPACE/i);
            $hit->change_date($1) if (m/Date:\s(.*?)[^a-zA-Z0-9\s]/i);
            # DAMAGED: pattern lost the tags around the description.
            $hit->description(strip_tags($1)) if (s!\(.*?)\!!);
            if (m/Size\s(\S+?),/i) {
                # Parse the size strictly instead of eval'ing server text;
                # an unparseable size is simply not recorded.
                my $sSize = $1;
                my $iBytes = _size_in_bytes($sSize);
                $hit->size(int $iBytes) if defined $iBytes;
                $state = $WEB_HITS;
            } # if
        }
        # DAMAGED: pattern lost the tags around the description.
        elsif ($state eq $DESC && s!^\(.*?)\!!) {
            print STDERR " description line\n" if 2 <= $self->{'_debug'};
            $hit->description(strip_tags($1));
            # DAMAGED: pattern lost a tag in front of the date.
            $hit->change_date($1) if (m/^\(.*?)\s /i);
            $state = $WEB_HITS;
        } # if

        # if (($state eq $NEXT || $state eq $COMP_NEXT) && m=^\s*
        # \s*$=i)
        #   {
        #   print STDERR " no next button\n" if 2 <= $self->{'_debug'};
        #   # There is no next button.
        #   $state = $HITS;
        #   }

        # DAMAGED: pattern lost a tag.
        elsif ($state eq $COMP_NEXT && m=^\$=) {
            print STDERR " no next button (company mode)\n" if 2 <= $self->{'_debug'};
            # There is no next button.
            $state = $HITS;
        }
        # DAMAGED: pattern lost tags.
        elsif ($state eq $COMP_NEXT && m=^\\\$=) # afb 10/98
        {
            print STDERR " no next button (web mode)\n" if 2 <= $self->{'_debug'};
            # There is no next button.
            $state = $HITS;
        }
        # DAMAGED: pattern lost the tags around "Articles".
        elsif ($state eq $HITS && m=\Articles\\s+\d+\s+-\s+\d+\s+of\s+\d+=) {
            # Actual line of input is:
            # Articles 51 - 100 of 104
            print STDERR "article count line\n" if 2 <= $self->{'_debug'};
            $state = $TRAILER;
        }
        elsif ($state eq $HITS && m/xxxxxx xxxxxx xxxxxx/) {
            print STDERR "xxxxxx line\n" if 2 <= $self->{'_debug'};
            $state = $TRAILER;
        }
        # DAMAGED: the end of the pattern, the opening brace and the start
        # of a debug print statement were swallowed here.
        elsif ($state eq $HITS && m/\>Hide\ssummaries\{'_debug'};
            # $state = $TRAILER;
        }
        # DAMAGED: pattern lost the anchor tag holding the URL capture.
        elsif ($state eq $HITS && m|\\([^\<]+)|i) {
            print STDERR "oldhit url line\n" if 2 <= $self->{'_debug'};
            # Actual line of input:
            # Wizard Press Columns and Departments:Toychest!
            # Sometimes the [tag lost in source] is on the next line.
            # Sometimes there is a /r right before the [tag lost in source]
            my ($sURL,$sTitle) = ($1,$2);
            # Ignore Infoseek-internal redirects (advertisements, etc.)
            unless ($sURL =~ m!^/redirect!i) {
                # hits from Companies database are internal www.Infoseek.com links:
                $sURL = 'http://www.infoseek.com'. $sURL if $sURL =~ m@^/Content@;
                if (defined($hit)) {
                    push(@{$self->{cache}}, $hit);
                }
                $hit = WWW::SearchResult->new;
                $hit->add_url($sURL);
                $self->{'_num_hits'}++;
                $hits_found++;
                $hit->title(strip_tags($sTitle));
                $state = $DESC;
            } # unless
        } # old URL line
        # DAMAGED: pattern lost the tags around the description.
        elsif ($state eq $DESC && m|\(.*?)\$|) {
            print STDERR "old description line\n" if 2 <= $self->{'_debug'};
            # Sometimes description is empty
            $hit->description(strip_tags($1)) if ref($hit);
            # News (col=NX) and non-news pages both continue in HITS state;
            # the former if/else had two identical branches.
            $state = $HITS;
        } # line is description
        elsif ($state eq $DESC && m|^(.+(\s\.\.?\.?)?)?\s \s \s*$|) {
            print STDERR "company description line\n" if 2 <= $self->{'_debug'};
            # Sometimes description is empty
            $hit->description(strip_tags($1)) if ref($hit);
            $state = $HITS;
        } # line is description
        elsif ($state eq $HITS && m=(\d+)\%$=) {
            print STDERR "score line\n" if 2 <= $self->{'_debug'};
            $hit->score($1) if ref($hit);
            $state = $HITS;
        }
        elsif ($state eq $HITS && m=\(Size\s([0-9.KM]+)\)=) {
            print STDERR "hit size line\n" if 2 <= $self->{'_debug'};
            # Parse strictly instead of eval'ing; unlike the web-hit branch
            # the value is not truncated to an integer here.
            my $sSize = $1;
            my $iBytes = _size_in_bytes($sSize);
            $hit->size($iBytes) if ref($hit) && defined $iBytes;
            $state = $HITS;
        }
        elsif ($state eq $HITS && m=Date:$SPACE(\d+\s+[A-Z][a-z]+\s+\d+)=) {
            print STDERR "hit change_date line\n" if 2 <= $self->{'_debug'};
            # Actual line of input is:
            # Document date: 22 Oct 1996
            # The date is $2 because $SPACE itself contains capture group 1.
            $hit->change_date($2) if ref($hit);
            $state = $HITS;
        }
        # DAMAGED: the optional groups around the date lost their tags.
        elsif ($state eq $HITS && m=^(\)?([a-zA-Z]+\s+\d+\s+[a-zA-Z]+\s+[\d:]+)(\)?=) {
            print STDERR "hit news date line\n" if 2 <= $self->{'_debug'};
            # Actual lines of input include:
            # Document date: 22 Oct 1996
            #
            # Wed 19 Aug 13:38
            $hit->change_date($2) if ref($hit);
            $state = $HITS;
        }
        else {
            print STDERR "didn't match\n" if 2 <= $self->{'_debug'};
        }
    } # foreach line of query results HTML page

    # The last hit on the page is still pending; flush it.
    if (ref($hit)) {
        push(@{$self->{cache}}, $hit);
    }

    return $hits_found;
} # native_retrieve_some

1;

__END__

Martin''s page download results, 1998-04:

values of URL fields:
st = starting result # (round down to multiple of 5?)
nh = number of hits per page (round down to multiple of 5)
rf = 0 means do not group results by site
col = HV for search on companies
col = WW for search on web
col = NX for search on news

default Companies search:
http://www.infoseek.com/Titles?qt=cable+tv&col=HV%2Ckt_N%2Cak_corpdir&sv=IS&lk=noframes&nh=10

simple Companies search:
http://www.infoseek.com/Titles?qt=cable+tv&col=HV&nh=10