Combo Web Crawler & Concordance File Generator (Plus New Visualization!)
February 26, 2008
So this week I wrote a Java program that asks for a URL, scrapes that page for all of its words, and outputs a text file listing each word with its count. I also worked on a more complicated text visualization using arcs and colors and whatnot.
Web Crawler+Concordance Code:
// Part 1: Front End Script that asks for the url
// Web Crawler driver program
package crawler;
import java.io.IOException;
//import a2z.A2ZFileWriter;
import a2z.ConsoleReader;
//import crawler.DrawBot;
public class Blm272_Week5 {
public static void main (String args[]) throws IOException {
ConsoleReader console = new ConsoleReader(System.in);
System.out.println(“Enter a url for input: “);
System.out.print(“%: “);
String url = console.readLine();
// A URL to start wtih
//String url = “http://itp.nyu.edu/blogblender”;
// Create a crawler object
DrawCrawler crawler = new DrawCrawler();
// Put the URL into the crawler object
crawler.addUrl(url);
// Since this crawler isn’t particularly polite: http://www.robotstxt.org/wc/guidelines.html
// I’m limited it to viewing 100 url requests
int count = 0; int limit = 0;
// Start crawling! (this should likely be its own thread.)
while (!crawler.queueEmpty()) {
crawler.crawl();
count++;
if (count > limit) break;
}
// If we end up here, there must be nothing left to crawl.
if (count < limit) System.out.println(“I ran out of websites to crawl.”);
else System.out.println(“I’m tired and don’t feel like crawling anymore.”);
}
}
————
//Part 2: The Edited Crawler (DrawCrawler) that outputs the concordance file
//Simple example of a web crawler
//URL queue: linked list
//Sites already visited: hash table
//Needs to be updated to comply with ROBOTS.TXT!
package crawler;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
import java.util.regex.*;
import processing.core.PApplet;
import processing.core.PFont;
import concordance.treemap.Word;
import a2z.A2ZFileReader;
import a2z.A2ZFileWriter;
import a2z.A2ZUrlReader;
import a2z.ConsoleReader;
public class DrawCrawler {
private LinkedList urlsToVisit; // A queue of URLs to visit
private HashMap urlsVisited; // A table of already visited URLs
private Pattern href; // A Pattern to match an href tag
private String ignore; // To be used as a regex for ignoring media files (JPG,MOV, etc.)
public DrawCrawler() {
urlsToVisit = new LinkedList();
urlsVisited = new HashMap();
// Match URLs
// Note using Pattern.COMMENTS flag which ignores white spaces and anything after ‘#’ in a regex
href = Pattern.compile(
“href # match href n” +
“\s*=\s*” # 0 or more spaces, =, 0 ore more spaces, quote n” +
“(http[^"\s]*) # capture the URL itself, http followed by no spaces and no quotes n” +
“” # ending with a quote n”,
Pattern.CASE_INSENSITIVE | Pattern.COMMENTS);
// We will ignore URLs ending with certain extensions
ignore = “.*(mov|jpg|gif|pdf)$”;
}
// A method to determine if the queue is empty or not
public boolean queueEmpty() {
return urlsToVisit.isEmpty();
}
// A method to crawl one URL
public void crawl() {
// Get a URL in the queue
String urlpath = (String) urlsToVisit.removeFirst();
// Read that URL
read(urlpath);
}
public void addUrl(String urlpath) {
// We can filter out certain kinds of URLs here
// In this example, our only critera is
// it matches the ignore pattern
if (!urlpath.matches(ignore)) {
// Add it to both the LinkedList and the HashMap
urlsToVisit.add(urlpath);
urlsVisited.put(urlpath,urlpath);
}
}
public void process(String content, String urlpath) throws IOException {
// This example just searches for web sites with the word “ITP” in them
// We could throw all the words into a concordance here
// Or implement some other type of storage / analysis algorithm
// Trivial example, searching for ITP
//Pattern p = Pattern.compile(“itp”,Pattern.CASE_INSENSITIVE);
//Matcher m = p.matcher(content);
//if (m.find()) {
//System.out.println(urlpath + ” contains ITP!”);
TreeMap words = new TreeMap();
// Step 3, break input file up into words
// We are doing this with split and a regular expression
String regex = “\b”;
String tokens[] = content.split(regex);
// We’ll use a regular exrpession to match words with only characters and apostrophes
// Throwing away all the punctuation (we could do this with a different split regex too)
Pattern p = Pattern.compile(“[a-z']+”,Pattern.CASE_INSENSITIVE);
// For every word
for (int i = 0; i < tokens.length; i++)
{
String s = tokens[i].toLowerCase();
// If it matches our regex, insert it in the tree
Matcher m = p.matcher(s);
if (m.matches()) {
if (words.containsKey(s)) {
Word w = (Word) words.get(s);
w.count();
} else {
Word w = new Word(s);
words.put(s,w);
}
}
}
// We’re done, print out contents of Tree!
System.out.println(“Here are the contents of your tree:”);
A2ZFileWriter fw = new A2ZFileWriter(“test.txt”);
Iterator iterator = words.values().iterator();
while (iterator.hasNext()) {
Word word = (Word) iterator.next();
System.out.println(word.getWord() + ” ” + word.getCount());
//System.out.println(word.getWord());
//String whatever = (word.getWord() + ” ” + word.getCount());
//fw.append(whatever);
fw.append(word.getWord() + ” ” + word.getCount());
fw.append(” “);
fw.append(“r”);
}
fw.close();
System.out.println(“Total words: ” + words.size());
//float elapsed = (System.currentTimeMillis() – thetime) / 1000.0f;;
//System.out.println(“Finished. Concordance time: ” + elapsed + ” seconds.”);
}
// A method to read a URL and look for other URLs
// (and possibly do some sort of analysis)
public void read(String urlpath) {
System.out.println(urlsToVisit.size() + ” ” + urlpath);
try {
// Grab the URL content
A2ZUrlReader urlr = new A2ZUrlReader(urlpath);
String stuff = urlr.getContent();
process(stuff, urlpath);
// Match the URL pattern to the content
Matcher m = href.matcher(stuff);
// While there are URLs
while (m.find()) {
// Grab the captured part of the regex (the URLPath itself)
String newurl = m.group(1);
if (!urlsVisited.containsKey(newurl)) {
addUrl(newurl);
}
}
} catch (Exception e) {
// System.out.println(“Problem reading from ” + urlpath + ” ” + e);
// e.printStackTrace();
}
}
}
—–
The Updated Drawing Code:
/* Ben Leduc-Mills */
/* Programming from A to Z */
/* Simple Text Concordance and text cloud */
/* Using a Java TreeMap */
/* Based on code by Dan Shiffman */
package concordance.processing;
//import java.awt.Font;
//import java.awt.Graphics;
import java.io.*;
//import java.lang.reflect.Array;
import java.util.*;
import java.util.regex.*;
import a2z.*;
import processing.core.PApplet;
import processing.core.PFont;
public class DrawText extends PApplet
{
ArrayList words;
PFont f;
public void setup() {
size(1000,1000);
words = new ArrayList();
fillConcordance(“test.txt”);
f = createFont(“Georgia”,16,true);
}
public void draw() {
background(255);
// We’re done, print out contents of Tree!
// We’re done, print out contents of Tree!
//System.out.println(“Here are the contents of your tree:”);
Iterator iterator = words.iterator();
while (iterator.hasNext()) {
Word word = (Word) iterator.next();
String output = word.getWord();
int values = word.getCount();
int length = output.length();
//System.out.println(length);
/*String s = “Tokyo”;
float sw = textWidth(s);
text(s, 0, 85);
line(sw, 50, sw, 100);*/
if (values > 0 && values <3) {
values = values * length;
}
else {
values = values/length;
}
//if (values < 3) {
// values = values *(values+2);
//}
//System.out.println(values);
fill(length*values);
textFont(f,values+length);
float width = textWidth(output);
System.out.println(width);
text(output, random(length*50,650), random(40,length*80));
//text (output, width*values, (5*length*values));
//line (width*20, values*20, length*40, values*40);
//circles
//fill(length*values+80);
stroke(length*values+80);
//strokeWeight((values+length)/4);
noFill();
//arc (width*10, length*10, values*10, values*10, 0, PI*2);
arc (width*values,4*length*values, values *10, values*10, 0, PI*2);
}
noLoop();
}
/*public void paint (Graphics g) {
System.out.println(“Here are the contents of your tree:”);
Iterator iterator = words.values().iterator();
while (iterator.hasNext()) {
Word word = (Word) iterator.next();
//System.out.println(word.getWord() + ” ” + word.getCount());
String output = ((word.getWord() + ” “.toString()));
//System.out.println(output);
Font font = new Font(“Dialog”, Font.PLAIN, 48);
g.setFont(font);
g.drawString(output, 100, 100);
}
noLoop();
} */
public void fillConcordance(String path) {
String[] lines = loadStrings(path);
for (int i = 0; i < lines.length; i++) {
String line = lines[i];
String[] stuff = line.split(” “);
Word w = new Word(stuff[0]);
w.setCount(Integer.parseInt(stuff[1]));
words.add(w);
}
}
}
Hashtable word finder
February 19, 2008
So this week, in preparation for a Morse code translator and parser, I developed some code that takes in a file of gibberish characters (i.e. “afhefbufbeyioruoehfhajsofvvnmx”) and finds words by cross referencing the substrings with a hashtable of real words.
The idea is that I will find a way to turn tapping into Morse code into a string of characters, and from there this code will handle the job of finding words from them.
Here’s the code:
import java.util.*;
import java.io.IOException;
import a2z.A2ZFileReader;
public class Blm272_week4 {
public static void main(String[] args) throws IOException {
Hashtable dictionary = new Hashtable();
A2ZFileReader fr = new A2ZFileReader(“warandpeace.txt”); //Change to dictionary later
// Read the content and break up into words
String content = fr.getContent();
String regex = “\\b”;
String[] words = content.split(regex);
for (int i = 0; i < words.length; i++) {
if (!dictionary.containsKey(words[i])) {
//String result = words[i];
dictionary.put(words[i], words[i]);
}
}
A2ZFileReader fr2 = new A2ZFileReader(“stuff.txt”); //Morse Source
String word = fr2.getContent();
System.out.println(“Morse:” + word);
//String word = “andthenasIwasstandingthereamongthetreesInoticedahummingbirdinthedistance”;
for (int i = 0; i < word.length(); i++) {
for (int j = i+1; j <= word.length(); j++) {
String subword = word.substring(i,j);
if (dictionary.contains(subword)&&subword.length()>3) {
System.out.println(“found: ” + subword);
}
}
}
}
}
The Gestalt for the upcoming RH games page
February 10, 2008
For the RapHappy.com games page:
1. create handle
2. play a game
3. receive feedback (message, score, etc.)
Of course, within the actual games the gestalt might be much more complex and varied, depending upon the game. But I think for any of the games to be meaningful, there must be persistence of identity (high scores become hard to track otherwise…), some sort of actual gameplay interaction, and some sort of feedback to keep the user interested (scores, messages, whatever the gameplay implies).
Week 3 – Text Cloud with Concordance
February 10, 2008
This week I wanted to make a text cloud using java. I used the treemap iterator to get the words and word count from the input text, then plugged in the count to determine the font size and gave the words a semi-random position on the stage. I did a version using the processing libraries and one without. The commented sections represent the pure java version.
Here’s the code:
/* Ben Leduc-Mills */
/* Programming from A to Z */
/* Simple Text Concordance and text cloud */
/* Using a Java TreeMap */
/* Based on code by Dan Shiffman */
package concordance.processing;
//import java.awt.Font;
//import java.awt.Graphics;
import java.io.*;
//import java.lang.reflect.Array;
import java.util.*;
import a2z.*;
import java.util.regex.*;
import concordance.treemap.Word;
import processing.core.PApplet;
import processing.core.PFont;
public class blm272_week3 extends PApplet
{
TreeMap words;
PFont f;
public void setup() {
size(1000,1000);
fillConcordance(“obama.txt”);
f = createFont(“Georgia”,16,true);
}
public void draw() {
background(255);
// We’re done, print out contents of Tree!
// We’re done, print out contents of Tree!
System.out.println(“Here are the contents of your tree:”);
Iterator iterator = words.values().iterator();
while (iterator.hasNext()) {
Word word = (Word) iterator.next();
String output = word.getWord();
int values = word.getCount();
if (values < 3) {
values = values *(values+2);
}
//System.out.println(values);
fill(0);
textFont(f,values+5);
text(output, random(100,650), random(100,650));
}
noLoop();
}
/*public void paint (Graphics g) {
System.out.println(“Here are the contents of your tree:”);
Iterator iterator = words.values().iterator();
while (iterator.hasNext()) {
Word word = (Word) iterator.next();
//System.out.println(word.getWord() + ” ” + word.getCount());
String output = ((word.getWord() + ” “.toString()));
//System.out.println(output);
Font font = new Font(“Dialog”, Font.PLAIN, 48);
g.setFont(font);
g.drawString(output, 100, 100);
}
noLoop();
} */
public void fillConcordance(String path) {
try {
A2ZFileReader fr = new A2ZFileReader(path);
String content = fr.getContent();
// Step 2, create an empty Tree
words = new TreeMap();
// Step 3, break input file up into words
// We are doing this with split and a regular expression
String regex = “\\b”;
String tokens[] = content.split(regex);
// We’ll use a regular exrpession to match words with only characters and apostrophes
// Throwing away all the punctuation (we could do this with a different split regex too)
Pattern p = Pattern.compile(“[a-z']+”,Pattern.CASE_INSENSITIVE);
// For every word
for (int i = 0; i < tokens.length; i++)
{
String s = tokens[i].toLowerCase();
// If it matches our regex, insert it in the tree
Matcher m = p.matcher(s);
if (m.matches()) {
if (words.containsKey(s)) {
Word w = (Word) words.get(s);
w.count();
} else {
Word w = new Word(s);
words.put(s,w);
}
}
}
} catch (IOException e) {
System.out.println(“File I/O Error”);
e.printStackTrace();
}
}
}
Here’s a screen shot of the result on an Obama speech:
Week 2: Ceiling cat sez
February 2, 2008
This week, using regular expressions, I continued my trouncing of the Bible by taking every reference to ‘God’ and the word after it, and replacing them both with ‘ceiling cat sez’. Hooray for lolcats.
Here’s the code:
// Word search and replace using regex
// Ben Leduc-Mills
// Programming A2Z, Spring 08
// Based off of code by Daniel Shiffman
import java.io.*;
import java.nio.*;
import java.nio.channels.*;
import java.util.regex.*;
/**
 * Replaces every occurrence of the word "God" together with the word that
 * follows it by "ceiling cat sez".
 *
 * Usage: java Blm272Week2 &lt;input file&gt; &lt;output file&gt;
 */
public class Blm272Week2 {

    /** Encoding used for both reading and writing, so results do not depend on the platform default. */
    private static final String CHARSET = "UTF-8";

    /** Text written in place of each match. */
    private static final String REPLACEMENT = "ceiling cat sez";

    /** "God" as a whole word, the non-word characters after it, and the next word. */
    private static final Pattern GOD_AND_NEXT_WORD =
            Pattern.compile("\\bGod\\b\\W+\\b(\\w+)\\b");

    /** A single word, used only for the word count that is reported. */
    private static final Pattern WORD = Pattern.compile("\\w+");

    /**
     * Reads the input file, prints it before and after the replacement, and
     * writes the replaced text to the output file.
     *
     * @param args args[0] is the file to read, args[1] the file to write
     * @throws IOException if either file cannot be read or written
     */
    public static void main (String[] args) throws IOException {
        if (args.length < 2) {
            System.err.println("Usage: java Blm272Week2 <input file> <output file>");
            return;
        }

        String content = readFile(args[0]);
        System.out.println("Read " + countWords(content) + " words from " + args[0]);

        String output = GOD_AND_NEXT_WORD.matcher(content).replaceAll(REPLACEMENT);

        // Print out the report to the screen as well as to the file
        System.out.println(content);
        System.out.println("running replace regex. . . ");
        System.out.println(output);

        writeFile(args[1], output);
    }

    /** Reads a whole file into one String. */
    private static String readFile(String path) throws IOException {
        FileInputStream fis = new FileInputStream(path);
        try {
            FileChannel fc = fis.getChannel();
            long size = fc.size();
            if (size > Integer.MAX_VALUE) {
                throw new IOException(path + " is too large to load into memory");
            }
            ByteBuffer bb = ByteBuffer.allocate((int) size);
            // A single read() may return fewer bytes than requested, so keep
            // reading until the buffer is full or the channel is exhausted.
            while (bb.hasRemaining()) {
                if (fc.read(bb) == -1) {
                    break;
                }
            }
            return new String(bb.array(), 0, bb.position(), CHARSET);
        } finally {
            // Closing the stream also closes its channel
            fis.close();
        }
    }

    /** Writes text to a file, replacing whatever the file held before. */
    private static void writeFile(String path, String text) throws IOException {
        FileOutputStream fos = new FileOutputStream(path);
        try {
            FileChannel outfc = fos.getChannel();
            ByteBuffer bb = ByteBuffer.wrap(text.getBytes(CHARSET));
            // A single write() may not drain the buffer
            while (bb.hasRemaining()) {
                outfc.write(bb);
            }
        } finally {
            fos.close();
        }
    }

    /** Counts the words in a text, ignoring the separators between them. */
    private static int countWords(String text) {
        int count = 0;
        Matcher m = WORD.matcher(text);
        while (m.find()) {
            count++;
        }
        return count;
    }
}
Some of my favorites:
01:001:001 In the beginning ceiling cat sez the heaven and the earth.
01:001:003 And ceiling cat sez, Let there be light: and there was light.
01:001:022 And ceiling cat sez them, saying, Be fruitful, and multiply, and
fill the waters in the seas, and let fowl multiply in the
earth.
01:001:027 So ceiling cat sez man in his own image, in the image of ceiling cat sez he him; male and female created he them.
01:002:003 And ceiling cat sez the seventh day, and sanctified it: because
that in it he had rested from all his work which ceiling cat sez
and made.
01:002:007 And the LORD ceiling cat sez man of the dust of the ground, and
breathed into his nostrils the breath of life; and man became
a living soul.
01:002:008 And the LORD ceiling cat sez a garden eastward in Eden; and there
he put the man whom he had formed.
Anyway, you get the idea. Fun with lolcats.
