# Lycos.pm # by Wm. L. Scheding and Martin Thurn # Copyright (C) 1996-1998 by USC/ISI # $Id: Lycos.pm,v 1.18 2000/06/15 19:39:27 mthurn Exp $ =head1 NAME WWW::Search::Lycos - class for searching Lycos =head1 SYNOPSIS use WWW::Search; my $oSearch = new WWW::Search('Lycos'); my $sQuery = WWW::Search::escape_query("+sushi restaurant +Columbus Ohio"); $oSearch->native_query($sQuery); while (my $oResult = $oSearch->next_result()) print $oResult->url, "\n"; =head1 DESCRIPTION This class is a Lycos specialization of L. It handles making and interpreting Lycos-site searches F. This class exports no public interface; all interaction should be done through L objects. =head1 NOTES www.lycos.com is sometimes slow to respond; but I have not had a problem with the default timeout. www.lycos.com does not give the score, date, nor size of the pages at the resulting URLs; therefore change_date(), score(), and size() will never have a value. =head1 SEE ALSO To make new back-ends, see L. =head1 BUGS Please tell the author if you find any! =head1 TESTING This module adheres to the WWW::Search test mechanism. See $TEST_CASES below. =head1 AUTHOR As of 1998-12-07, C is maintained by Martin Thurn (MartinThurn@iname.com). C was originally written by Martin Thurn, based on C version 1.12 of 1998-10-22. =head1 LEGALESE THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. =head1 VERSION HISTORY If it is not listed here, then it was not a meaningful nor released revision. =head2 2.11, 2000-06-15 new method gui_query, and parse new output format =head2 2.09, 1999-12-26 output format fixes, and query string changes for searching Lycos.com (dbradford@bdctechnologies.com) =head2 2.08, 1999-12-22 point to new path on lycos.com (thanks to David Bradford dbradford@bdctechnologies.com) =head2 2.07, 1999-12-10 more output format fixes, and missing 'next' link for Sites =head2 2.05, 1999-12-03 handle new url and new output format for Lycos::Sites.pm =head2 2.04, 1999-10-22 use strip_tags(); extract real URL from www.lycos.com's redirection URL =head2 2.03, 1999-10-05 now uses hash_to_cgi_string() =head2 2.02, 1999-09-30 Now able to get Web Sites results via child module Sites.pm =head2 2.01, 1999-07-13 =head2 1.04, 1999-04-30 Now uses lycos.com's advanced query format. =head2 1.02, 1998-12-10 First public release after being adopted by Martin Thurn. =cut ##################################################################### package WWW::Search::Lycos; require Exporter; @EXPORT = qw(); @EXPORT_OK = qw(); @ISA = qw(WWW::Search Exporter); $VERSION = '2.11'; $MAINTAINER = 'Martin Thurn '; use Carp (); use WWW::Search(qw( generic_option strip_tags unescape_query )); require WWW::SearchResult; use URI::Escape; sub gui_query { my ($self, $sQuery, $rh) = @_; $self->{'_options'} = { 'search_url' => 'http://www.lycos.com/srch/', 'query' => $sQuery, 'lpv' => 1, 'loc' => 'searchhp', }; return $self->native_query($sQuery, $rh); } # gui_query sub native_setup_search { my ($self, $native_query, $native_options_ref) = @_; $self->{_debug} = $native_options_ref->{'search_debug'}; $self->{_debug} = 2 if ($native_options_ref->{'search_parse_debug'}); $self->{_debug} = 0 if (!defined($self->{_debug})); # During Sites searches, lycos.com returns 10 hits per page no # matter what. my $DEFAULT_HITS_PER_PAGE = 100; $DEFAULT_HITS_PER_PAGE = 20 if 0 < $self->{_debug}; $self->{'_hits_per_page'} ||= 0; $self->{'_hits_per_page'} = $DEFAULT_HITS_PER_PAGE unless 0 < $self->{'_hits_per_page'}; $self->{agent_e_mail} = 'MartinThurn@iname.com'; $self->user_agent('non-robot'); $self->{_next_to_retrieve} = 1; $self->{'_num_hits'} = 0; # The default search uses lycos.com's Advanced Search mechanism: if (!defined($self->{_options})) { $self->{'search_base_url'} = 'http://lycospro.lycos.com'; # ------------------------------------------------------------------------- # Modifications: 12/26/99 by dbradford@bdctechnologies.com # New Query String: As of 12/26/99, here is what it looks like querying for "linux" # Query String: http://lycospro.lycos.com/srchpro/?loc=searchhp&lpv=1&query=linux&t=all&type=websites # Removed these options: # 'maxhits' => $self->{_hits_per_page}, # 'matchmode' => 'or', # 'cat' => 'lycos', # 'mtemp' => 'nojava', # 'adv' => 1, # # Added these one in: # 'lpv' => '1', # 'loc' => 'searchhp', # 'type' => 'websites', # 't' => 'all', # 'query' => $native_query, # ------------------------------------------------------------------------- $self->{_options} = { 'search_url' => $self->{'search_base_url'} .'/srchpro/', 'lpv' => '1', 'loc' => 'searchhp', 'type' => 'websites', 't' => 'all', 'query' => $native_query, }; } # if $self->{_options}->{'query'} = $native_query; my $options_ref = $self->{_options}; # Copy in options which were passed in our second argument: if (defined($native_options_ref)) { foreach (keys %$native_options_ref) { $options_ref->{$_} = $native_options_ref->{$_}; } # foreach } # if # Copy in options which were set by a child object: if (defined($self->{'_child_options'})) { foreach (keys %{$self->{'_child_options'}}) { $self->{'_options'}->{$_} = $self->{'_child_options'}->{$_}; } # foreach } # if # Finally figure out the url. $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options}); } # native_setup_search # private sub native_retrieve_some { my ($self) = @_; print STDERR " * Lycos::native_retrieve_some()\n" if $self->{_debug}; # Fast exit if already done: return undef if (!defined($self->{_next_url})); my $sCurrURL = $self->{_next_url}; # If this is not the first page of results, sleep so as to not # overload the server: $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'}; # Get some: print STDERR " * sending request ($sCurrURL)\n" if $self->{_debug}; # print STDERR " * sending request ($sCurrURL)\n"; my($response) = $self->http_request('GET', $sCurrURL); $self->{response} = $response; if (!$response->is_success) { return undef; } $self->{'_next_url'} = undef; $self->{'_next_to_retrieve'} += $self->{'_hits_per_page'}; print STDERR " * got response\n" if $self->{_debug}; # parse the output my ($HEADER, $HITS, $DESC, $COLLECT, $SKIP1, $SKIP2) = qw( HE HI DE CO K1 K2 ); my $hits_found = 0; my $state = $HEADER; my $cite = ""; my $hit; my $description = ''; LINE_OF_INPUT: foreach ($self->split_lines($response->content())) { next LINE_OF_INPUT if m@^$@; # short circuit for blank lines print STDERR " * $state ===$_=== " if 2 <= $self->{'_debug'}; if ($state eq $HEADER && m|^Web\sSites|i) { # Actual line of input: # Web Sites $state = $SKIP2; } elsif ($state eq $SKIP1 && m|^Featured On Lycos$|) { $state = $SKIP2; } elsif ($state eq $SKIP2 && m|end vertical partner offers|) { # $state = $HITS; } elsif ($state eq $HEADER && m|Web\sPages|i) { # Actual lines of input are: # Web Pages (37054) # Web Pages print STDERR " page list intro\n" if ($self->{_debug}); $self->approximate_result_count($1) if m=\((\d+)\)=; $state = $HITS; } # if elsif ($state eq $HEADER && m!([,\d]+)\s*Web\ssites?\s(?:were|was)\sfound!i) { # Actual line of input are: #   127 Web sites were found in a search of the complete Lycos Web catalog #   1 Web site was found in 0.0502 seconds during an Advanced Search of All the Web print STDERR " site list intro\n" if ($self->{_debug}); my $i = $1; $i =~ s/,//; $self->approximate_result_count($1); $state = $HITS; } # if elsif ($state eq $HITS && m@
  • ]+?)\"?>\s?(.*?)@i) { # Actual line of input is: #
  • WISHBONE - A SHORT BIO OF THE GREAT SHOW FOR CHILDREN 
    #

    Merchandise Collector Masks - Star Wars my ($url, $title) = ($1,$2); if ($url =~ m/dir\.lycos\.com/) { print STDERR "skip lycos category name\n" if 2 <= $self->{_debug}; next LINE_OF_INPUT; } # if print STDERR "hit url line\n" if 2 <= $self->{_debug}; if (defined($hit)) { push(@{$self->{cache}}, $hit); }; $hit = new WWW::SearchResult; $hit->add_url($url); $hits_found++; $hit->title(strip_tags($title)); if ($url =~ m=//news\.lycos\.com/=) { $state = $COLLECT; } else { $state = $DESC; } } elsif ($state eq $HITS && (m!Try searching for!i || m!Second opinion!i || m!\.doubleclick\.!)) { # Actual line of input is: #

    Second opinion? Try searching for "vjbrooks" at

    print STDERR "no next button\n" if 2 <= $self->{_debug}; last LINE_OF_INPUT; } elsif ($state eq $HITS && m!

    !i) { print STDERR " multi-result line, splitting...\n" if 2 <= $self->{_debug}; # Actual line of input is #

  • Martin Thurn's Index Page - Martin Thurn Why am I so busy? I am gainfully employed at TASC. I have a family including 3 beautiful children. I am a co-habitant with two immaculate cats. I am the editor of The Star Wars Collector,
    http://www.pitt.edu/~thurn/

  • Gasthof Post - St. Martin in Thurn, S. Martin in Badia, Pustertal, Val Pusteria, - Diese Web-Seite verwendet Frames. Frames werden von Ihrem Browser aber nicht unterstutzt.
    http://www.posta.suedtirol.com/

  • ... my @asHits = split /\074[Pp]\076/; HIT: foreach my $sHit (@asHits) { # Actual chunk of the line is: #
  • Prison, Police, rePression
    Society > Issues > Human Rights > Political Prisoners #
  • Prison Issues Desk
    Society > Issues > Human Rights > Political Prisoners #
  • Human Rights Watch Prison Project: Prison Conditions and the Treatment of Prisoners - Information on prison conditions around the world, international human rights standards applicable to prisoners, and prison-related activities of the U.N. and other organizations.
    Society > Issues > Crime and Justice > Prisons > Organizations print STDERR " + $state ===$sHit=== " if 2 <= $self->{'_debug'}; my ($iHit,$iPercent,$iBytes,$sURL,$sTitle,$sDesc,$sDate) = (0,0,0,'','','',''); $sURL = &strip_tags($1) if $sHit =~ m!target=\"?(.+?)[&\">]!; if ($sURL =~ m!\s(>|>)\s!) { # This link is a Lycos category; we have to get the URL from # somewhere else in the line: $sURL = &uri_unescape($1) if $sHit =~ m!target=(.+?)&!i; } $sTitle = &strip_tags($1) if $sHit =~ m!LCOSW\d+\">(.+?)
    !i; $sTitle = &strip_tags($1) if $sHit =~ m!LCOSW\d+\">(.+?)\s-\s!i; my $sLoc = &strip_tags($1) if $sHit =~ m!
    (.+?)!; $sDesc = &strip_tags($1) if $sHit =~ m!\s-\s(.+?)!; if ($sURL ne '') { if (ref($hit)) { push(@{$self->{cache}}, $hit); } # if $hit = new WWW::SearchResult; $hit->add_url(uri_unescape($sURL)); $hit->title($sTitle) if $sTitle ne ''; $hit->description($sDesc) if $sDesc ne ''; $hit->location($sLoc) if $sLoc ne ''; $hit->score($iPercent) if 0 < $iPercent; $hit->size($iBytes) if 0 < $iBytes; $hit->change_date($sDate) if $sDate ne ''; $self->{'_num_hits'}++; $hits_found++; print STDERR " OK\n" if 2 <= $self->{_debug}; } # if $URL else { print STDERR " CANNOT FIND URL\n" if 2 <= $self->{_debug}; } } # foreach } # ------------------------------------------------------------------------- # Modifications: 12/26/99 by dbradford@bdctechnologies.com # Fixed reg expression to handle the font tags Lycos has added to output. # Old reg expression wouldn't grab the title, new one fixes this of course. # I have kept the old and new below to show the difference. You may cut out this # code if you so desire. # Old: # (m@^(?:
  • |\s)?]+?)\"?\>(.*?)\s-\s(.*)$@i || # m@
  • ]+?)\"?\>(.*?) \(.*)$@i)) # New: # (m@^(?:
  • |\s)?.*?]+?)\"?\>(.*?)\s-\s(.*)$@i || # m@
  • .*?]+?)\"?\>(.*?) \(.*)$@i)) # ------------------------------------------------------------------------- elsif ($state eq $HITS && (m@^(?:
  • |\s)?.*?]+?)\"?\>(.*?)\s-\s(.*?)@i || m@
  • .*?]+?)\"?\>(.*?) \(.*?)@i)) { # Actual line of input is: #
  • CD Solutions Inc. CD-ROM, Replication, and Duplication page - CD Solutions makes CD-ROM production easy # The Star Wars Collectors Archive - An archive of Star Wars Collectibles.  

    # Wholesale Only! Pokemon, Magic, Star Wars, Star Trek. Sales to Qualified retail stores only! - Wholesale Sales to qualified retail outlets only. We are authorized distributors for Wizards of the Coast, Decipher and most other trading card game companies. We have Pokemon.  

    print STDERR "hit url+desc line\n" if 2 <= $self->{_debug}; my ($sURL, $sTitle, $sDesc) = ($1,$2,$3); $sURL = unescape_query($1) if $sURL =~ m/target=(.+?)&/i; if (defined($hit)) { push(@{$self->{cache}}, $hit); } $hit = new WWW::SearchResult; $hit->add_url($sURL); $hits_found++; $hit->title(strip_tags($sTitle)); $hit->description(strip_tags($sDesc)); } elsif ($state eq $COLLECT && m=^(.*?)
    =i) { print STDERR "end of description collection\n" if 2 <= $self->{_debug}; $description .= $1; $hit->description(strip_tags($description)); $description = ''; $state = $HITS; } elsif ($state eq $COLLECT) { print STDERR "description collection\n" if 2 <= $self->{_debug}; $description .= "$_ "; } elsif ($state eq $DESC && m=^
    $=) { # Actual line of input is: #
    print STDERR "lone BR line\n" if 2 <= $self->{_debug}; # Do not change state; try to get the description from the next line } elsif ($state eq $DESC && m=^$=) { # Actual line of input is: # print STDERR "lone TR TD line\n" if 2 <= $self->{_debug}; # This item has no description. $state = $HITS; } elsif ($state eq $DESC && m=^(.+)
    =i) { # Actual line of input is: # Star Wars logo   Merchandise Collector Masks - Star Wars   Emperor Palpatine Greedo Yoda Emperor Palpatine Gre
    http://scifireplicas.com/movies/masks.htm print STDERR "description line\n" if 2 <= $self->{_debug}; my $sDescription = $1; $hit->description(strip_tags($sDescription)) if defined($hit); $state = $HITS; } elsif ($state eq $HITS && m!]+)\"?\>(?:next|More Web Sites)!i) { # Actual line of input is: # [ 9,469 More Web Sites about thurn ] print STDERR "next line\n" if 2 <= $self->{_debug}; # There is a "next" button on this page, therefore there are # indeed more results for us to go after next time. my $sURL = $1; $self->{'_next_to_retrieve'} = $1 if $sURL =~ m/first=(\d+)/; $self->{'_next_url'} = URI->new_abs($sURL, $sCurrURL); print STDERR " * next URL is ", $self->{'_next_url'}, "\n" if 2 <= $self->{_debug}; # print STDERR " * next URL is ", $self->{'_next_url'}, "\n"; last LINE_OF_INPUT; } else { print STDERR "didn't match\n" if 2 <= $self->{_debug}; } } # foreach if (defined($hit)) { push(@{$self->{cache}}, $hit); } # if # Sometimes, the result page for a Lycos::Sites search has a DEAD # next link. Here is a hack to create our own _next_url based on # what WE know about the search so far: if (ref($self) =~ m/::Sites/ && $self->{'_next_to_retrieve'} <= $self->approximate_result_count ) { $self->{_options}->{'first'} = $self->{'_next_to_retrieve'}; $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options}); } return $hits_found; } # native_retrieve_some 1; __END__ URL for GUI query: http://www.lycos.com/srch/?lpv=1&loc=searchhp&query=thurn