#!/usr/bin/perl
#
# Reads a tab-separated "<name>.dat" file, appends a LOGRECNO column looked up
# from data/azgeo.csv (keyed by tract / block group / block), and writes the
# result to "<name>_out.dat".  A previous "<name>_out.dat" is first renamed to
# "<name>_old.dat" and read alongside the input.
#
# NOTE(review): `use warnings` is intentionally not enabled here; the code
# relies on undefined cells being treated as empty strings / zero, so turning
# warnings on would need those places guarded first.
use strict;

# Here we pull in our helper libraries
use FileHandle; # This will let us keep the output up-to-date
use Census;
use Data::Dumper;

# Autoflush (output) for immediate feedback
$| = 1;

# ************************
# *** TSV Manipulation ***
# ************************

# Given a fresh file, grab the first row which we will assume are our headers.
#
#   $file   - an open, readable filehandle positioned at the header row
#   returns - the list of column names (trailing empty columns are dropped)
#   dies    - "Header not found!" when the handle has no more lines
sub getHeaders {
    my ($file) = @_;

    # Test definedness rather than truth so that a final line consisting of
    # "0" with no trailing newline is not mistaken for end-of-file.
    my $line = <$file>;
    die "Header not found!" unless defined $line;

    $line =~ tr/\n\r//d;    # strip LF and CR so DOS-style files work too
    $line =~ s/\t+$//;      # ignore trailing tabs
    return split(/\t/, $line);
}

# Given a file and our headers, grab the next line and split it into
# name->value pairs.
#
#   $file    - an open, readable filehandle
#   @headers - column names, in file order
#   returns  - a flattened hash (header => cell); headers without a matching
#              cell map to undef.  At end of input it prints "X\n" to the
#              currently selected output handle and returns an empty list.
sub getRowHash {
    my ($file, @headers) = @_;

    my $line = <$file>;
    unless (defined $line) {
        print "X\n";
        # We didn't find a line, so we'll return nothingness
        return ();
    }

    $line =~ tr/\n\r//d;
    $line =~ s/\t+$//;

    my %row;
    # Hash slice: pairs each header with the cell at the same position.
    @row{@headers} = split(/\t/, $line);
    return %row;
}

# Write one tab-separated row, terminated by a newline, to $file.
sub printRow {
    my ($file, @data) = @_;
    print {$file} join("\t", @data) . "\n";
}

# ******************
# *** Main Logic ***
# ******************

# Load the geography CSV at $path into a nested lookup:
#   tract -> block group -> block -> id
# Only the first four comma-separated fields of each line are used.  The three
# key fields are numified (+0), which drops any zero padding.  Each id is
# echoed to the selected output handle as it is loaded.
# Returns a reference to the lookup hash; dies if the file cannot be opened.
sub loadGeoLookup {
    my ($path) = @_;

    open(my $geo, '<', $path)
        or die "Error opening geo data '$path': $!\n";

    my %lookup;
    while (my $record = <$geo>) {
        my ($tract, $block_group, $block, $id) = split(/,/, $record);
        # Strip CR as well as LF so a DOS-style file does not leave "\r" in
        # the ids that end up in the output.
        $id =~ tr/\n\r//d if defined $id;
        $lookup{$tract + 0}{$block_group + 0}{$block + 0} = $id;
        print "$id\n";
    }
    close($geo);

    return \%lookup;
}

# Process "<filename>.dat" into "<filename>_out.dat".
#
#   $filename - input path, with or without the ".dat" suffix
#
# Dies if no filename is given, if any file cannot be opened, if an existing
# output file cannot be moved aside, or if the output cannot be closed.
sub main {
    my ($filename) = @_;
    die "Usage: $0 <file>[.dat]\n"
        unless defined $filename && length $filename;
    $filename =~ s/\.dat$//; # Chop off .dat

    my $in_path  = $filename . ".dat";
    my $out_path = $filename . "_out.dat";
    my $old_path = $filename . "_old.dat";

    my $mem = loadGeoLookup("data/azgeo.csv");

    my ($in, $cur, $out);
    open($in, '<', $in_path) or die "Error opening input: $!\n";

    if (-e $out_path) {
        # Move the previous output aside before it gets truncated below.  A
        # checked rename() avoids passing the filename through a shell and
        # stops us from clobbering the old results if the move fails.
        rename($out_path, $old_path)
            or die "Error moving old output to '$old_path': $!\n";
        open($cur, '<', $old_path) or die "Error opening old output: $!\n";
    }

    open($out, '>', $out_path) or die "Error opening output: $!\n";
    $out->autoflush(1); # Make sure that as soon as we output something we save it

    # my (@newstuff) = qw( tract block_group block district );
    my (@newstuff) = qw( LOGRECNO );
    my $linenum = 0;

    my (@headers) = getHeaders($in);
    getHeaders($cur) if $cur;   # skip the old output's header row
    @headers = (@headers, @newstuff);
    printRow($out, @headers);

    my %row;
    while ((%row) = getRowHash($in, @headers)) {
        $linenum++;

        if ($cur) {
            # Carry over the added columns from the matching row of the
            # previous output; stop consulting it once it runs out of rows.
            my %cur_row = getRowHash($cur, @headers);
            undef $cur unless %cur_row;
            foreach my $column (@newstuff) {
                $row{$column} = $cur_row{$column};
            }
        }

        # Reuse of cached values is switched off by hand (constant false).
        # The condition that used to be here was: $row{LOGRECNO}
        if (0) {
            print "$linenum - Found cached version, skipping.\n";
        }
        else {
            # This is what I want
            # my (%census_data) = Census::get($street);
            my $grp   = $row{block_group};
            my $tract = $row{tract} * 100;
            print " --> $tract $grp $row{block}\n";

            # NOTE(review): the lookup keys were numified when loaded, but
            # block group and block are used here exactly as read from the
            # input -- confirm the input never zero-pads those fields.
            my (%census_data) =
                (LOGRECNO => $mem->{$tract}{$grp}{$row{block}});
            print "$linenum - " . join(',', %census_data) . "\n";
            %row = (%row, %census_data);
            #print "Sleeping for 3 seconds...\r";
            #sleep 3;
        }

        printRow($out, map { $row{$_} } @headers);
    }

    close($in);
    close($cur) if $cur;
    # Buffered write errors only surface at close, so check it.
    close($out) or die "Error closing output: $!\n";
}

main(@ARGV);

=pod

# Here we go through, check to see that everything lines up, and also try the
# rows that didn't work again (getting them from the census website)
sub verify_and_reget {
  my ($in, $out, $new_out, %row);
  open($in, "input.dat") or die "Error opening input file!\n";
  open($out, "output.csv");
  open($new_out, ">new_out.csv");
  my $linenum = 0;
  my (@headers) = getHeaders($in);
  while((%row) = getRowHash($in, @headers)) {
    last if(exists($row{''}));
    $linenum++;
    print "Working on line $linenum\r";
    print "\n" unless $linenum % 1000;
    my $street = join(',',$row{StreetNo},$row{StreetDir},
                      $row{StreetName},$row{StreetType});
    my $outline = <$out>;
    if($outline =~ /^$street,(.*)$/) {
      my $data = $1;
      if($data =~ /,,,/) {
        print "Found blank for $street\n";
        my $str2 = $street;
        $str2 =~ s/,/ /g;
        my $content = get_content($str2);
        my ($tract, $block_group, $block, $district) = parse_content($str2,$content);
        print "$linenum\t";
        print "Tract: $tract\t";
        print "Block Group: $block_group\t";
        print "Block: $block\t";
        print "District: $district\n";
        print $new_out "$street,$tract,$block_group,$block,$district\n";
        # eh... who needs sleep? you're never gunna get it.
        # sleep 2;
      } else {
        print $new_out $outline;
      }
    } else {
      if(%row) {
        use Data::Dumper;
        print "DUMP: " . Dumper(\%row) . "\n";
        die "Error on line $linenum -- No match! '$street', $outline";
      }
    }
  }
  print "\nNo problemo!\n";
}

# Now we call one of the main routines. For now we just switch by hand.
#initial_get();
verify_and_reget();

=cut