package Census; use strict; use LWP::UserAgent; # This is our web-browser use URI::Escape; # This helps us put things into URL form use Data::Dumper; # Memoize some subs to a tied file use TiedMemoize; TiedMemoize::tied_memoize('Census::lookup_address'); TiedMemoize::tied_memoize('Census::get_logrecno'); # ******************************************** # *** Content grabbing and data extraction *** # ******************************************** # Set up the global UserAgent use vars qw( $ua $errors ); $ua = LWP::UserAgent->new; # Given a street address, go look it up on the census website and return the # entire page sub lookup_address { my ($street) = @_; #print "Looking up $street\n"; my $uri_street = uri_escape($street); my $url = "http://factfinder.census.gov/servlet/AGSGeoAddressServlet" ."?IS_ADDRESS_VALID=N&IS_GEO_FOUND=N&programYear=50%3A420" ."&street=$uri_street" ."&city=Phoenix" ."&states=Arizona" ."&_programYear=50&_treeId=420&_lang=en" ."&_stateSelectedFromDropDown=%5Bobject+HTMLSelectElement%5D"; my $response = $ua->get($url); unless($response->is_success) { warn $response->status_line; warn "Errors: $errors"; $errors++; die "Too many errors :(" if $errors > 10; } my $content = $response->content; return $content; } # Given a census website page, suck out just the interesting information sub parse_address_content { my ($content) = @_; if($content =~ /select name="_geo_id" size="8" class="listboxie">(.*?)<\/select/ms) { my $data = $1; $data =~ s/<\/?option.*?>//g; # Remove all the junk #print "DEBUG: " . Dumper($data); my (@extract) = $data =~ / Census\ Tract\ ([\d.]+)\D .*? Block\ Group\ ([\d.]+)\D .*? Block\ ([\d.]+)\D .*? :\ (.*?)\ Voting\ District /msx; #print "DEBUG: " . Dumper(\@extract); my @names = qw( tract block_group block district ); my %data; for(my $i = 0; $i < $#names + 1; $i++) { $data{$names[$i]} = $extract[$i] || ''; } return %data; } else { #print "Content not found :-|\n"; return (); } } =pod (%data) = address_to_block("123 E Nowhere") assumes Phoenix, AZ returns ( tract => $tract, block_group => $block_group, block => $block. district => $district ) =cut sub address_to_block { my ($street) = @_; my $content = lookup_address($street); my (%data) = parse_address_content($content); return (%data); } my $logrec_2000_filename = '/mnt/data/tmp/cdata/zips/azgeo.uf1'; # Important file in data: # Logical Record Number 18 7 # Tract 55 6 # Block Group 61 1 # Block 62 4 sub get_logrecno { my ($year, $tract, $blockgrp, $block) = @_; if($year != 2000) { die "I don't know how to look up logrecno for year '$year'!"; } $tract = sprintf "%.2f", $tract; $tract =~ s/\.//; my $search = sprintf "'%06.6s'\t'% 1.1s'\t'% 4.4s'",$tract,$blockgrp,$block; #print "SEARCH: $search\n"; open my $file, "<", $logrec_2000_filename or die "ERROR: $!\n"; my $line; while($line = <$file>) { my $line_logrecno = substr($line, 18, 7); my $line_tract = substr($line, 55, 6); my $line_blockgrp = substr($line, 61, 1); my $line_block = substr($line, 62, 4); my $line_val = "'$line_tract'\t'$line_blockgrp'\t'$line_block'"; #print "$line_logrecno\t$line_val\n"; if($search eq $line_val) { return $line_logrecno; } } print "Not found :(\n"; return undef; # undef indicates not-found } # Given a row of non-census data, try to get the address sub getAddress { my (%row) = @_; my ($street_no, $street_dir, $street_name, $street_type); $street_no = $row{STREET_NO} || $row{StreetNo} || die "\nDidn't find street number!\n"; $street_dir = $row{StreetDir} || die "\nDidn't find street direction!\n"; $street_name = $row{StreetName} || die "\nDidn't find street name!\n"; $street_type = $row{StreetType} || die "\nDidn't find street type!\n"; my $addr = "$street_no $street_dir $street_name $street_type"; $addr = uc($addr); $addr =~ s/\.//g; $addr =~ s/^\s*//; $addr =~ s/\s*$//; if(!$addr) { die "\nError! Couldn't figure out address!\n"; } return $addr; } 1;