So this week I wrote a Java program that asks for a URL and scrapes that page for all of its words, writing out a text file that lists each word along with its count.  I also worked on a more complicated text visualization using arcs and colors and whatnot.

Web Crawler+Concordance Code:
// Part 1: Front End Script that asks for the url
// Web Crawler driver program

package crawler;

import java.io.IOException;

//import a2z.A2ZFileWriter;
import a2z.ConsoleReader;
//import crawler.DrawBot;

public class Blm272_Week5 {

public static void main (String args[]) throws IOException {

ConsoleReader console = new ConsoleReader(System.in);
System.out.println(“Enter a url for input: “);
System.out.print(“%: “);
String url = console.readLine();
// A URL to start wtih
//String url = “http://itp.nyu.edu/blogblender”;

// Create a crawler object
DrawCrawler crawler = new DrawCrawler();

// Put the URL into the crawler object
crawler.addUrl(url);

// Since this crawler isn’t particularly polite: http://www.robotstxt.org/wc/guidelines.html
// I’m limited it to viewing 100 url requests
int count = 0; int limit = 0;

// Start crawling! (this should likely be its own thread.)
while (!crawler.queueEmpty()) {
crawler.crawl();
count++;
if (count > limit) break;
}

// If we end up here, there must be nothing left to crawl.
if (count < limit) System.out.println(“I ran out of websites to crawl.”);
else System.out.println(“I’m tired and don’t feel like crawling anymore.”);

}

}

————

//Part 2: The Edited Crawler (DrawCrawler) that outputs the concordance file
//Simple example of a web crawler
//URL queue: linked list
//Sites already visited: hash table

//Needs to be updated to comply with ROBOTS.TXT!

package crawler;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
import java.util.regex.*;

import processing.core.PApplet;
import processing.core.PFont;

import concordance.treemap.Word;

import a2z.A2ZFileReader;
import a2z.A2ZFileWriter;
import a2z.A2ZUrlReader;
import a2z.ConsoleReader;

public class DrawCrawler {

private LinkedList urlsToVisit;    // A queue of URLs to visit
private HashMap urlsVisited;        // A table of already visited URLs
private Pattern href;                        // A Pattern to match an href tag
private String ignore;                    // To be used as a regex for ignoring media files (JPG,MOV, etc.)

public DrawCrawler() {
urlsToVisit = new LinkedList();
urlsVisited = new HashMap();

// Match URLs
// Note using Pattern.COMMENTS flag which ignores white spaces and anything after ‘#’ in a regex
href = Pattern.compile(
“href                    # match href n” +
“\s*=\s*”            # 0 or more spaces, =, 0 ore more spaces, quote n” +
“(http[^"\s]*)        # capture the URL itself, http followed by no spaces and no quotes n” +
“”                        # ending with a quote n”,

Pattern.CASE_INSENSITIVE | Pattern.COMMENTS);

// We will ignore URLs ending with certain extensions
ignore = “.*(mov|jpg|gif|pdf)$”;
}

// A method to determine if the queue is empty or not
public boolean queueEmpty() {
return urlsToVisit.isEmpty();
}

// A method to crawl one URL
public void crawl() {
// Get a URL in the queue
String urlpath = (String) urlsToVisit.removeFirst();
// Read that URL
read(urlpath);
}

public void addUrl(String urlpath) {
// We can filter out certain kinds of URLs here
// In this example, our only critera is
// it matches the ignore pattern
if (!urlpath.matches(ignore)) {
// Add it to both the LinkedList and the HashMap
urlsToVisit.add(urlpath);
urlsVisited.put(urlpath,urlpath);
}
}

public void process(String content, String urlpath) throws IOException {
// This example just searches for web sites with the word “ITP” in them
// We could throw all the words into a concordance here
// Or implement some other type of storage / analysis algorithm
// Trivial example, searching for ITP
//Pattern p = Pattern.compile(“itp”,Pattern.CASE_INSENSITIVE);
//Matcher m = p.matcher(content);
//if (m.find()) {
//System.out.println(urlpath + ” contains ITP!”);

TreeMap words = new TreeMap();

// Step 3, break input file up into words
// We are doing this with split and a regular expression
String regex = “\b”;
String tokens[] = content.split(regex);

// We’ll use a regular exrpession to match words with only characters and apostrophes
// Throwing away all the punctuation (we could do this with a different split regex too)
Pattern p = Pattern.compile(“[a-z']+”,Pattern.CASE_INSENSITIVE);

// For every word
for (int i = 0; i < tokens.length; i++)
{
String s = tokens[i].toLowerCase();
// If it matches our regex, insert it in the tree
Matcher m = p.matcher(s);
if (m.matches()) {
if (words.containsKey(s)) {
Word w = (Word) words.get(s);
w.count();
} else {
Word w = new Word(s);
words.put(s,w);
}
}
}

// We’re done, print out contents of Tree!
System.out.println(“Here are the contents of your tree:”);
A2ZFileWriter fw = new A2ZFileWriter(“test.txt”);
Iterator iterator = words.values().iterator();
while (iterator.hasNext()) {
Word word = (Word) iterator.next();
System.out.println(word.getWord() + ” ” + word.getCount());
//System.out.println(word.getWord());
//String whatever = (word.getWord() + ” ” + word.getCount());
//fw.append(whatever);
fw.append(word.getWord() + ” ” + word.getCount());
fw.append(” “);
fw.append(“r”);

}
fw.close();
System.out.println(“Total words: ” + words.size());

//float elapsed = (System.currentTimeMillis() – thetime) / 1000.0f;;
//System.out.println(“Finished.  Concordance time: ” + elapsed + ” seconds.”);

}

// A method to read a URL and look for other URLs
// (and possibly do some sort of analysis)
public void read(String urlpath) {

System.out.println(urlsToVisit.size() + ” ” + urlpath);
try {
// Grab the URL content
A2ZUrlReader urlr = new A2ZUrlReader(urlpath);
String stuff = urlr.getContent();

process(stuff, urlpath);

// Match the URL pattern to the content
Matcher m = href.matcher(stuff);
// While there are URLs
while (m.find()) {
// Grab the captured part of the regex (the URLPath itself)
String newurl = m.group(1);
if (!urlsVisited.containsKey(newurl)) {
addUrl(newurl);
}
}
} catch (Exception e) {
// System.out.println(“Problem reading from ” + urlpath + ” ” + e);
// e.printStackTrace();
}

}
}

—–

The Updated Drawing Code:

/* Ben Leduc-Mills               */
/* Programming from A to Z       */
/* Simple Text Concordance and text cloud      */
/* Using a Java TreeMap          */
/* Based on code by Dan Shiffman */

package concordance.processing;

//import java.awt.Font;
//import java.awt.Graphics;
import java.io.*;
//import java.lang.reflect.Array;
import java.util.*;
import java.util.regex.*;
import a2z.*;
import processing.core.PApplet;
import processing.core.PFont;

public class DrawText extends PApplet
{
ArrayList words;
PFont f;

public void setup() {
size(1000,1000);
words = new ArrayList();
fillConcordance(“test.txt”);
f = createFont(“Georgia”,16,true);

}

public void draw() {
background(255);
// We’re done, print out contents of Tree!
// We’re done, print out contents of Tree!
//System.out.println(“Here are the contents of your tree:”);
Iterator iterator = words.iterator();
while (iterator.hasNext()) {
Word word = (Word) iterator.next();

String output = word.getWord();
int values = word.getCount();
int length = output.length();
//System.out.println(length);

/*String s = “Tokyo”;
float sw = textWidth(s);
text(s, 0, 85);
line(sw, 50, sw, 100);*/

if (values > 0 && values <3) {

values = values * length;
}
else {
values = values/length;
}
//if (values < 3) {
//  values = values *(values+2);
//}
//System.out.println(values);

fill(length*values);
textFont(f,values+length);
float width = textWidth(output);
System.out.println(width);
text(output, random(length*50,650), random(40,length*80));
//text (output, width*values, (5*length*values));
//line (width*20, values*20, length*40, values*40);

//circles
//fill(length*values+80);
stroke(length*values+80);
//strokeWeight((values+length)/4);
noFill();
//arc (width*10, length*10, values*10, values*10, 0, PI*2);
arc (width*values,4*length*values, values *10, values*10, 0, PI*2);
}
noLoop();
}

/*public void paint (Graphics g) {

System.out.println(“Here are the contents of your tree:”);
Iterator iterator = words.values().iterator();
while (iterator.hasNext()) {
Word word = (Word) iterator.next();

//System.out.println(word.getWord() + ” ” + word.getCount());
String output = ((word.getWord() + ” “.toString()));
//System.out.println(output);

Font font = new Font(“Dialog”, Font.PLAIN, 48);

g.setFont(font);
g.drawString(output, 100, 100);

}
noLoop();
} */

public void fillConcordance(String path) {
String[] lines = loadStrings(path);
for (int i = 0; i < lines.length; i++) {
String line = lines[i];
String[] stuff = line.split(” “);
Word w = new Word(stuff[0]);
w.setCount(Integer.parseInt(stuff[1]));
words.add(w);
}
}
}

Leave a Reply

You must be logged in to post a comment.