# Excite.pm
# by Martin Thurn
# Copyright (C) 1998 by USC/ISI
# $Id: Excite.pm,v 1.24 2000/06/19 14:47:03 mthurn Exp $

=head1 NAME

WWW::Search::Excite - backend for searching www.excite.com

=head1 SYNOPSIS

  use WWW::Search;
  my $oSearch = new WWW::Search('Excite');
  my $sQuery = WWW::Search::escape_query("+sushi restaurant +Columbus Ohio");
  $oSearch->native_query($sQuery);
  while (my $oResult = $oSearch->next_result())
    { print $oResult->url, "\n"; }

=head1 DESCRIPTION

This class is an Excite specialization of WWW::Search.
It handles making and interpreting Excite searches
F<http://www.excite.com>.

This class exports no public interface; all interaction should
be done through L<WWW::Search> objects.

=head1 NOTES

www.excite.com does not report the approximate result count.

=head1 SEE ALSO

To make new back-ends, see L<WWW::Search>.

=head1 CAVEATS

Only returns results from Excite's "Web Results".
Ignores all other sections of Excite's query results.

=head1 BUGS

Please tell the author if you find any!

=head1 TESTING

This module adheres to the C<WWW::Search> test suite mechanism.
See the value of $TEST_CASES below.

=head1 AUTHOR

As of 1998-03-23, C<WWW::Search::Excite> is maintained by Martin Thurn
(MartinThurn@iname.com).

C<WWW::Search::Excite> was originally written by Martin Thurn,
based on an existing C<WWW::Search> backend.

=head1 LEGALESE

THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.

=head1 VERSION HISTORY

=head2 2.07, 2000-03-29

BUGFIX for sometimes missing header (and getting NO results)

=head2 2.06, 2000-03-02

BUGFIX for bungled next_url

=head2 2.05, 2000-02-08

testing now uses WWW::Search::Test module;
www.excite.com only allows (up to) 50 per page (and no odd numbers)

=head2 2.04, 2000-01-28

www.excite.com changed their output format slightly

=head2 2.03, 1999-10-20

www.excite.com changed their output format slightly;
use strip_tags() on title and description results

=head2 2.02, 1999-10-05

now uses hash_to_cgi_string()

=head2 1.12, 1999-06-29

updated test cases

=head2 1.10, 1999-06-11

fixed a BUG where returned URLs were garbled (maybe this was because
www.excite.com changed their links)

=head2 1.08, 1998-11-06

www.excite.com changed their output format slightly (thank you Jim
(jsmyser@bigfoot.com) for pointing it out!)

=head2 1.7, 1998-10-09

use new split_lines function

=head2 1.5

\n changed to \012 for MacPerl compatibility

=head2 1.4

Modified for new Excite output format.

=head2 1.2

First publicly-released version.

=cut

#####################################################################

package WWW::Search::Excite;

use strict;
use warnings;

require Exporter;
our @EXPORT    = qw();
our @EXPORT_OK = qw();
our @ISA       = qw(WWW::Search Exporter);

use Carp ();
use WWW::Search qw( generic_option strip_tags );
require WWW::SearchResult;

our $VERSION = '2.09';
# NOTE(review): the address in angle brackets was missing from the copy of
# the source this was restored from (only 'Martin Thurn ' with a trailing
# space survived); it is filled in from agent_e_mail below -- confirm.
our $MAINTAINER = 'Martin Thurn <MartinThurn@iname.com>';


# private
#
# native_setup_search($native_query, $native_options_ref)
#
# Prepares the object for a new Excite query.
#   $native_query       - the query string (expected to be already escaped)
#   $native_options_ref - optional hashref of options; every key is copied
#                         into the CGI option set, and 'search_debug' /
#                         'search_parse_debug' select the debug level
# Resets the paging counters, builds $self->{_options} on first use and
# stores the URL of the first result page in $self->{_next_url}.
sub native_setup_search {
    my ($self, $native_query, $native_options_ref) = @_;

    # Set some private variables:
    $self->{_debug} = $native_options_ref->{'search_debug'};
    $self->{_debug} = 2 if ($native_options_ref->{'search_parse_debug'});
    $self->{_debug} ||= 0;

    # www.excite.com only allows (up to) 50 per page (see VERSION HISTORY 2.05).
    my $DEFAULT_HITS_PER_PAGE = 50;
    # $DEFAULT_HITS_PER_PAGE = 30 if $self->{_debug};
    $self->{'_hits_per_page'} = $DEFAULT_HITS_PER_PAGE;

    $self->{agent_e_mail} = 'MartinThurn@iname.com';
    $self->user_agent(0);

    $self->{'_next_to_retrieve'} = 0;
    $self->{'_num_hits'} = 0;

    # Remove '*' at end of query terms within the user's query.  If the
    # query string is not escaped (even though it's supposed to be),
    # change '* ' to ' ' at end of words and at the end of the string.
    # If the query string is escaped, change '%2A+' to '+' at end of
    # words and delete '%2A' at the end of the string.
    # (Octal escapes: \052 is '*', \040 is ' ', \045 is '%', \053 is '+'.)
    $native_query =~ s/(\w)\052\s/$1\040/g;
    $native_query =~ s/(\w)\052$/$1\040/g;
    $native_query =~ s/(\w)\0452A\053/$1\053/g;
    $native_query =~ s/(\w)\0452A$/$1/g;

    if (!defined($self->{_options})) {
        $self->{_options} = {
            'search_url'  => 'http://search.excite.com/search.gw',
            'perPage'     => $self->{'_hits_per_page'},
            'showSummary' => 'true',
            'start'       => $self->{'_next_to_retrieve'},
            's'           => $native_query,
            'c'           => 'web',
        };
    } # if

    my $options_ref = $self->{_options};
    if (defined($native_options_ref)) {
        # Copy in new options.
        foreach my $key (keys %$native_options_ref) {
            $options_ref->{$key} = $native_options_ref->{$key};
        } # foreach
    } # if

    # Finally, figure out the url.
    $self->{_next_url} = $self->{_options}{'search_url'} . '?'
                       . $self->hash_to_cgi_string($options_ref);
} # native_setup_search


# private
#
# native_retrieve_some()
#
# Fetches the page at $self->{_next_url} and walks it line by line with a
# small state machine (HEADER -> HITS -> DESC/SKIP*/DESC2 -> ... -> TRAILER).
# Every hit found is pushed onto @{$self->{cache}} as a WWW::SearchResult.
# If a "Next Results" button is seen, $self->{_next_url} is set to the URL
# of the following page; otherwise it is left undef.
#
# Returns the number of hits found on this page, or undef when there is
# nothing (more) to fetch or the HTTP request failed.
#
# NOTE(review): the copy of the source this was restored from had all HTML
# tags stripped out of the regular expressions and of the "actual line of
# input" examples.  Patterns that lost text are rebuilt below as the most
# permissive form consistent with what survived; each one is marked and
# should be checked against the upstream module / real Excite output.
sub native_retrieve_some {
    my ($self) = @_;

    # Fast exit if already done.  ('return undef' rather than a bare return
    # is kept on purpose so the value seen by existing callers is unchanged.)
    return undef unless defined($self->{_next_url});

    # If this is not the first page of results, sleep so as to not
    # overload the server:
    $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'};

    # Get some results, adhering to the WWW::Search mechanism:
    print STDERR " * sending request (", $self->{_next_url}, ")\n" if $self->{'_debug'};
    my $response = $self->http_request('GET', $self->{_next_url});
    $self->{response} = $response;
    unless ($response->is_success) {
        return undef;
    }
    print STDERR " * got response\n" if $self->{'_debug'};
    $self->{'_next_url'} = undef;

    # Parse the output.  (The former $URL state was entered by the
    # percentage branch but never handled by any branch, so the parser got
    # stuck there; it has been removed -- see the percentage branch below.)
    my ($HEADER, $HITS, $DESC, $DESC2, $TRAILER, $SKIP1, $SKIP2, $SKIP3)
        = qw( HE HH DE D2 TR S1 S2 S3 );
    my $hits_found = 0;
    my $state = $HEADER;
    my $hit;
    my $pending_score;    # score seen on a percentage line, for the NEXT hit

    foreach ($self->split_lines($response->content())) {
        next if m/^$/;    # short circuit for blank lines
        print STDERR " * $state ===$_===" if 2 <= $self->{'_debug'};

        if ($state eq $HEADER && m{^\[(\d+)\s+hits.}) {
            # Actual line of input is (markup lost):
            # [9000 hits. About Your Results]
            my $count = $1;
            print STDERR "header line (first page)\n" if 2 <= $self->{'_debug'};
            $self->approximate_result_count($count);
            $state = $HITS;
        } # we're in HEADER mode, and line has number of results

        # NOTE(review): the second alternative of (?:\240|...) had decayed
        # to a plain space; '&nbsp;' is assumed.  A plain space is still
        # accepted through the leading \s*.
        elsif ($state eq $HEADER
               && (m{^\s*(?:\240|&nbsp;)?\d+-(\d+)\s*$}
                   || m!\AWeb\sSite\sResults\s\d+-(\d+)\sfor:!)) {
            # Actual line of input is (markup lost):
            #  11-20
            # Web Site Results 1-22 for: +LSAM +replication
            my $last_shown = $1;
            print STDERR "header line (second/only page)\n" if 2 <= $self->{'_debug'};
            # Only use this as the count if no real count was seen earlier.
            unless (defined($self->approximate_result_count)
                    and 0 < $self->approximate_result_count) {
                $self->approximate_result_count($last_shown);
            } # unless
            $state = $HITS;
        } # we're in HEADER mode, and line has number of results

        # NOTE(review): the two optional groups around \d+ were empty '()'
        # in the damaged source; an optional opening/closing tag around the
        # number is assumed.
        elsif ($state eq $HEADER
               && m{^\s*Top\s+(<[^>]+>)?\d+(</[^>]+>)?\s*(Web\s+Site)?$}) {
            # Actual line of input is (markup lost):
            # Top 30
            # Top 50 Web Site
            print STDERR "header line (no count)\n" if 2 <= $self->{'_debug'};
            $state = $HITS;
        } # we're in HEADER mode, and line has no count

        # This branch is tested before the percentage branch so that a hit
        # whose title happens to begin with e.g. "100%" is still a hit.
        # NOTE(review): everything between the optional <p>/<li> prefix and
        # the title capture was lost; a plain <A HREF="url"> anchor is
        # assumed.  If Excite wraps result links in a redirect URL, the real
        # target may have to be extracted from the captured URL -- confirm.
        elsif ($state eq $HITS
               && m!\A(?:<(?:p|li)>\s*)?\<A\s+HREF=\"([^\"]+)\"[^>]*>([^\<]+)!i) {
            # Copy the captures before any other code can disturb them.
            my ($url, $title) = ($1, $2);
            print STDERR "hit url line\n" if 2 <= $self->{'_debug'};
            # Actual lines of input (markup lost) show titles such as:
            #   Greedo
            #   Charmeleon's Guestbook
            #   |-r a d i u m-|    The Half Life Map Center
            # Sometimes the closing tag (presumably </A>) is on the next
            # line.  Sometimes there is a \r right before it.
            if (ref($hit) && $hit->url) {
                push(@{$self->{cache}}, $hit);
            }
            $hit = WWW::SearchResult->new;
            $self->{'_num_hits'}++;
            $hits_found++;
            $hit->add_url($url);
            $hit->title(strip_tags($title));
            if (defined $pending_score) {
                $hit->score($pending_score);
                undef $pending_score;
            }
            $state = $DESC;
            # NOTE(review): tag inferred from a bullet left behind in the
            # damaged source; matched case-insensitively as in the original.
            $state = $SKIP1 if m!<li>!i;
        } # in HITS mode, saw start of a hit

        # NOTE(review): the tag in front of the number was lost; any single
        # tag immediately followed by "NN%" is accepted.
        elsif ($state eq $HITS && m{\<[^>]+>(\d+)\%}i) {
            print STDERR "hit percentage line\n" if 2 <= $self->{'_debug'};
            # Actual line of input (markup lost):
            # 92%
            # The percentage precedes the hit it belongs to, and the hit
            # object is only created when its URL line is seen.  Formerly
            # this called $hit->score here (dying on the first hit, or
            # scoring the previous one) and moved to a state nothing
            # handled.  Now the value is held for the next hit and we stay
            # in HITS mode.
            $pending_score = $1;
        } # in HITS mode, saw percentage line

        elsif ($state eq $SKIP1) {
            print STDERR "skip1\n" if 2 <= $self->{'_debug'};
            $state = $SKIP2;
        }
        elsif ($state eq $SKIP2) {
            print STDERR "skip2\n" if 2 <= $self->{'_debug'};
            $state = $SKIP3;
        }
        elsif ($state eq $SKIP3) {
            print STDERR "skip3\n" if 2 <= $self->{'_debug'};
            $state = $DESC2;
            # NOTE(review): tag inferred (see above); /i added because the
            # original letter case is unknown.
            $state = $HITS if m!<li>!i;
        }
        elsif ($state eq $DESC2) {
            print STDERR "desc2\n" if 2 <= $self->{'_debug'};
            $hit->description(strip_tags($_));
            $state = $HITS;
        }

        # NOTE(review): in the three patterns below the terminating tag was
        # lost; <br> is assumed from the line-break artefact it left behind,
        # and matched case-insensitively because its case is unknown.
        elsif ($state eq $DESC && m/^<br>$/i) {
            print STDERR "no desc\n" if 2 <= $self->{'_debug'};
            $state = $HITS;
        }
        elsif ($state eq $DESC
               && (m/^\-\s(.+?)<br>/i || m/^\-\s(.+)$/)) {
            my $description = $1;
            print STDERR "hit description line\n" if 2 <= $self->{'_debug'};
            # Actual line of input (markup lost):
            # - Bootlegs Maintained by Gus Lopez (lopez@halcyon.com) Bootlegs
            #   toys and other Star Wars collectibles were made primarily in
            #   countries where Star Wars was not commercially released in
            #   theaters. Most Star Wars bootlegs originate from the eastern
            #   bloc countries: Poland, Hungary, and Russia.
            #   http://www.toysrgus.com/images-bootleg.html
            # (The description ends when we see the terminating tag, or goes
            # to end-of-line if there is none.)
            $hit->description(strip_tags($description));
            $state = $HITS;
        } # line is description

        # NOTE(review): the start of this pattern was lost (only
        # ']*VALUE=...' survived); '<INPUT[^>' is assumed.
        elsif ($state eq $HITS && m/<INPUT[^>]*VALUE=\"Next\sResults\"/i) {
            print STDERR " found next button\n" if 2 <= $self->{'_debug'};
            # There is a "next" button on this page, therefore there are
            # indeed more results for us to go after next time.
            # Process the options.
            $self->{'_next_to_retrieve'} += $self->{'_hits_per_page'};
            $self->{'_options'}{'start'} = $self->{'_next_to_retrieve'};
            # Finally, figure out the url.
            $self->{_next_url} = $self->{_options}{'search_url'} . '?'
                               . $self->hash_to_cgi_string($self->{_options});
            $state = $TRAILER;
        }
        else {
            print STDERR "didn't match\n" if 2 <= $self->{'_debug'};
        }
    } # foreach line of query results HTML page

    if ($state ne $TRAILER) {
        # End, no other pages (missed some tag somewhere along the line?)
        $self->{_next_url} = undef;
    }
    # Flush the last hit, which no later URL line pushed for us.
    if (ref($hit)) {
        push(@{$self->{cache}}, $hit);
    }

    return $hits_found;
} # native_retrieve_some

1;

__END__

Martin''s page download notes, 1998-03:

fields on advanced search page:
c           (select) search where: 'web','web.review','timely','web.de','web.fr','web.uk','web.se'
FT_1        (select) 'w' the word(s) or 'p' the phrase for MAY contain
FL_1        (hidden) '3'
FI_1        (text)   the search terms (MAY contain)
FT_2        (select) 'w' the word(s) or 'p' the phrase for MUST contain
FL_2        (hidden) '4'
FI_2        (text)   search terms (MUST contain)
FT_3        (select) 'w' the word(s) or 'p' the phrase for MUST NOT contain
FL_3        (hidden) '2'
FI_3        (text)   search terms (MUST NOT contain)
mode        (hidden) 'advanced'
numFields   (hidden) '3'
lk          (hidden) 'default'
sort        (radio)  'relevance' or 'site'
showSummary (select) 'true' titles & summaries or 'false' titles only
perPage     (select) '10','20','30','40','50'

simplest pages, normal search:

http://search.excite.com/search.gw?search=Martin+Thurn&start=0&showSummary=true&perPage=50
http://search.excite.com/search.gw?search=Martin+Thurn&start=150&showSummary=true&perPage=50

simplest first page, advanced search:

http://search.excite.com/search.gw?c=web&FT_1=w&FI_1=Christie+Abbott&mode=advanced&numFields=3&sort=relevance&showSummary=true&perPage=50