// AltavistaReader.java
//
import java.io.*;
import java.net.*;
import java.util.*;
import gnu.regexp.*;
/**
* AltavistaReader sends a query to Altavista, parses each result page
* until its stream is empty, and returns the matches as an array of Hits.
*
* @author Andrew Walenstein
*/
public class AltavistaReader
{
static final int MAX_HITS_PER_PAGE = 10;
/**
* Reads the input line by line until it is exhausted, extracts every
* search hit found in the Altavista result HTML, and closes the stream.
*
* A hit is recognised as three consecutive lines: one matching the
* line-start pattern, one matching the content pattern and one matching
* the "Last modified ..." statistics pattern. If the second or third
* line does not match, the lines read so far are discarded and scanning
* resumes with the following line. If the input ends in the middle of
* a hit, the partial hit is dropped.
*
* NOTE(review): if compiling a pattern throws REException, the exception
* is only printed to System.out and the pattern variables stay null, so
* the first getMatch call in the loop would throw a NullPointerException.
* NOTE(review): the stream is closed only on the normal return path; an
* IOException from readLine leaves it open.
*
* @param inStream an open stream of Altavista "text mode" results
* @return a Vector holding one Hit per result parsed from the page, in
* page order; empty if nothing matched
* @throws IOException if something goes wrong reading the HTML
*/
private static
Vector parsePage ( Reader inStream ) throws IOException
{
RE lineStartRE = null;
RE contentLineRE = null;
RE statsLineRE = null;
try
{
// NOTE(review): the literal below spans three source lines and shows
// only two capture groups, yet group 3 is read when the Hit is built
// further down. It looks like markup inside the pattern was lost --
// TODO restore it from the original source.
lineStartRE = new RE
( "^
- ([0-9]+). *([^<]+)
- $" );
// Captures the text from the start of the line up to the first '<'.
contentLineRE = new RE
( "^([^<]+)<" );
// Captures three fields from a line of the form
// "Last modified <date> - page size <n>K - in <word>".
statsLineRE = new RE
( "^Last modified ([0-9]*-...-..) - page size ([0-9]+)K - in (\\w+)" );
}
catch ( REException e )
{
// Reported only; execution continues with the patterns left as they are.
System.out.println ( "Exc: " + e );
}
final BufferedReader bRead = new BufferedReader ( inStream );
final Vector hits = new Vector ( MAX_HITS_PER_PAGE );
// NOTE(review): numHits is never read or updated.
int numHits = 0;
String line;
while ( (line = bRead.readLine()) != null )
{
// Skip ahead until a line that opens a hit.
final REMatch part1 = lineStartRE.getMatch ( line );
if ( part1 == null ) continue;
// The next two lines must be the content line and the statistics line;
// end of input stops parsing, a mismatch abandons this hit.
if ( (line = bRead.readLine()) == null ) break;
final REMatch part2 = contentLineRE.getMatch ( line );
if ( part2 == null ) continue;
if ( (line = bRead.readLine()) == null ) break;
final REMatch part3 = statsLineRE.getMatch ( line );
if ( part3 == null ) continue;
// The captured groups are handed to the Hit constructor in this order:
// three from the line-start match, one from the content match and
// three from the statistics match.
hits.addElement ( new Hit ( part1.toString(1),
part1.toString(2),
part1.toString(3),
part2.toString(1),
part3.toString(1),
part3.toString(2),
part3.toString(3) ) );
}
inStream.close();
return hits;
}
/**
* Given an Altavista query string and a maximum number of hits,
* fetches successive result pages and returns the hits found on them.
*
* Pages are requested MAX_HITS_PER_PAGE results apart (stq=0, 10, 20, ...)
* until maxHits hits have been gathered, a page yields no hits, or fewer
* than MAX_HITS_PER_PAGE hits have been gathered after a page. The
* returned array never holds more than maxHits entries.
*
* @param query query string in Altavista format
* @param maxHits maximum number of hits to return
* @return an array of Hits returned by Altavista (possibly empty);
* null if query is null or maxHits is not positive
* @throws IOException if fetching or reading a result page fails
*/
public static
Hit[] getQuery ( String query, int maxHits ) throws IOException
{
    if ( query == null || maxHits <= 0 ) return null;
    final String avURL = "http://altavista.digital.com/cgi-bin/query?";
    final String formQuery = URLEncoder.encode ( query );
    final String queryBase = avURL + "pg=q&q=" + formQuery;
    final String suffix = "&c9k";
    // A growable collection: the number of pages needed is not known up
    // front, because a page may yield fewer than MAX_HITS_PER_PAGE hits.
    final Vector collected = new Vector();
    int numPages = 0;
    while ( collected.size() < maxHits )
    {
        final URL queryURL;
        try
        {
            queryURL = new URL ( queryBase + "&stq=" +
                                 (numPages * MAX_HITS_PER_PAGE) + suffix );
        }
        catch ( MalformedURLException e )
        {
            System.err.println ( "Bad URL formed: " + e.toString() );
            // Retrying would build the same URL again, so give up and
            // return whatever has been gathered so far.
            break;
        }
        final InputStream resultStream = queryURL.openStream();
        Vector pageHits;
        try
        {
            pageHits = parsePage ( new InputStreamReader ( resultStream ) );
        }
        finally
        {
            // parsePage closes the stream only when it returns normally;
            // make sure it is released on failure as well.
            resultStream.close();
        }
        if ( pageHits.size() <= 0 ) break;
        // Copy the page's hits, stopping once the caller's limit is reached.
        for ( Enumeration e = pageHits.elements() ;
              e.hasMoreElements() && collected.size() < maxHits ; )
        {
            collected.addElement ( e.nextElement() );
        }
        numPages++;
        // Less than one full page in total: no point asking for another.
        if ( collected.size() < MAX_HITS_PER_PAGE ) break;
    }
    final Hit[] collect = new Hit[collected.size()];
    collected.copyInto ( collect );
    return collect;
}
/**
* Command-line driver: runs each argument as an Altavista query asking
* for at most 10 hits and prints every hit to standard output. Titles
* and bodies of 50 or more characters are cut to their first 49.
* An IOException is reported on standard error and the next argument
* is tried.
*
* @param args query strings, one query per argument
*/
public static void main ( String[] args )
{
    int argIdx = 0;
    while ( argIdx < args.length )
    {
        final String queryText = args[argIdx++];
        try
        {
            final Hit[] found = getQuery ( queryText, 10 );
            for ( int k = 0 ; k != found.length ; k++ )
            {
                final Hit hit = found[k];
                System.out.println ( "\nRecord #" + hit.number );
                System.out.println ( "URL: " + hit.theURL );
                // Short values are printed whole, long ones are clipped.
                System.out.println ( "Title: " +
                    ( hit.title.length() < 50 ? hit.title
                                              : hit.title.substring(0,49) ) );
                System.out.println ( "Body: " +
                    ( hit.body.length() < 50 ? hit.body
                                             : hit.body.substring(0,49) ) );
                System.out.println ( "Size(K):" + hit.sizeK );
                System.out.println ( "Lang: " + hit.language );
            }
        }
        catch ( IOException ioe )
        {
            System.err.println ( "IO " + ioe.toString() );
        }
    }
}
}