// AltavistaReader.java // import java.io.*; import java.net.*; import java.util.*; import gnu.regexp.*; /** * AltavistaReader parses a given stream until it is empty, * and returns an array of Hits. * * @author Andrew Walenstein */ public class AltavistaReader { static final int MAX_HITS_PER_PAGE = 10; /** * Empties the input, parsing the HTML return result contained therein, * and returns the salient contents. * * @param inStream an open stream of Altavista "text mode" results * @returns AltavistaPageContents information contained in page * @throws IOException if something goes wrong reading the HTML */ private static Vector parsePage ( Reader inStream ) throws IOException { RE lineStartRE = null; RE contentLineRE = null; RE statsLineRE = null; try { lineStartRE = new RE ( "^
([0-9]+). *([^<]+)
$" ); contentLineRE = new RE ( "^([^<]+)<" ); statsLineRE = new RE ( "^Last modified ([0-9]*-...-..) - page size ([0-9]+)K - in (\\w+)" ); } catch ( REException e ) { System.out.println ( "Exc: " + e ); } final BufferedReader bRead = new BufferedReader ( inStream ); final Vector hits = new Vector ( MAX_HITS_PER_PAGE ); int numHits = 0; String line; while ( (line = bRead.readLine()) != null ) { final REMatch part1 = lineStartRE.getMatch ( line ); if ( part1 == null ) continue; if ( (line = bRead.readLine()) == null ) break; final REMatch part2 = contentLineRE.getMatch ( line ); if ( part2 == null ) continue; if ( (line = bRead.readLine()) == null ) break; final REMatch part3 = statsLineRE.getMatch ( line ); if ( part3 == null ) continue; hits.addElement ( new Hit ( part1.toString(1), part1.toString(2), part1.toString(3), part2.toString(1), part3.toString(1), part3.toString(2), part3.toString(3) ) ); } inStream.close(); return hits; } /** * Given an Altavista query string and a maximum number of hits, * this function returns an array of hits returned from Altavista. * * @param query query string in Altavista format * @param maxHits maximum number of hits to return * @returns an array of Hits returned by Altavista, null if invalid * @throws IOException if querying error */ public static Hit[] getQuery ( String query, int maxHits ) throws IOException { if ( maxHits <= 0 ) return null; final String avURL = "http://altavista.digital.com/cgi-bin/query?"; final String formQuery = URLEncoder.encode ( query ); final String queryBase = avURL + "pg=q&q=" + formQuery; final String suffix = "&c9k"; final Vector[] hitCollect = new Vector[(int)(maxHits/MAX_HITS_PER_PAGE)+1]; int numPages = 0; int totHits = 0; while ( totHits < maxHits ) { try { URL queryURL; queryURL = new URL ( queryBase + "&stq=" + (int)(numPages*10) + suffix ); InputStream resultStream = queryURL.openStream(); hitCollect[numPages] = parsePage ( new InputStreamReader ( resultStream ) ); if ( hitCollect[numPages].size() <= 0 ) break; totHits += hitCollect[numPages].size(); numPages++; if ( totHits < MAX_HITS_PER_PAGE ) break; } catch ( MalformedURLException e ) { System.err.println ( "Bad URL formed: " + e.toString() ); } } Hit[] collect = new Hit[totHits]; for ( int i = 0, hitNum = 0 ; i < numPages ; ++i ) { for ( Enumeration e = hitCollect[i].elements() ; e.hasMoreElements() ; ) { collect[hitNum++] = (Hit)e.nextElement(); } } return collect; } public static void main ( String[] args ) { for ( int i = 0 ; i < args.length ; ++i ) { try { Hit[] result = getQuery ( args[i], 10 ); for ( int q = 0 ; q < result.length ; ++q ) { Hit r = result[q]; System.out.println ( "\nRecord #" + r.number ); System.out.println ( "URL: " + r.theURL ); if ( r.title.length() < 50 ) { System.out.println ( "Title: " + r.title ); } else { System.out.println ( "Title: " + r.title.substring(0,49)); } if ( r.body.length() < 50 ) { System.out.println ( "Body: " + r.body ); } else { System.out.println ( "Body: " + r.body.substring(0,49) ); } System.out.println ( "Size(K):" + r.sizeK ); System.out.println ( "Lang: " + r.language ); } } catch ( IOException e ) { System.err.println ( "IO " + e.toString() ); } } } }