#!/usr/bin/perl
use strict;
use warnings;

use LWP::UserAgent;
use XML::RSS;

my $ua = LWP::UserAgent->new;

# Given a base pipermail URL (including the YYYY-Month), return the threads.
#
# Fetches "$url/thread.html" and returns a list of [$msg_url, $topic] array
# refs, one per distinct subject, sorted by message URL in descending order.
# Dies with the HTTP status line if the page could not be fetched.
sub get_threads {
    my ($url) = @_;

    my $response = $ua->get("$url/thread.html");

    # Well, since we didn't get the page correctly let's just DIE!
    die $response->status_line unless $response->is_success;

    my $content = $response->content;

    # OK, so now we have the content. It just so happens that all the links
    # we're looking for are inside of capitalized LI tags, and run to the end
    # of the line. Lucky us!
    # NOTE(review): assumes each entry looks like
    #   <LI><A HREF="000123.html">[Phoenix-pm] Subject
    # with the closing </A> on the following line -- confirm against a live
    # thread.html page.
    my @messages = ($content =~ /<LI>(.*)/g);

    # Now we'll create a list of threads. Actually we'll stick them into a
    # hash so that we get a unique list of subjects. We'll keep the URL and
    # bare subject for later use.
    my %thread;
    foreach my $msg (@messages) {
        # Some cleanup -- add in the closing </A> (it lives on the next line
        # of the page) and remove the list subject indicator.
        $msg .= '</A>';
        $msg =~ s/\[Phoenix-pm\] //;

        my ($msg_path, $topic) = $msg =~ /<A HREF="(.*?)">(.*?)<\/A>/;

        # Skip <LI> lines that are not message links.
        next unless defined $topic;

        # First message seen for a subject wins; replies share the subject.
        next if exists $thread{$topic};

        $thread{$topic} = ["$url/$msg_path", $topic];
    }

    # That's it! Now all of our [$url, $topic] pairs are the values of
    # %thread. They are all mixed up from keeping them in a hash, so sort
    # them by URL, which happens to also give us the most-recent-first order
    # we like.
    my @threads = reverse sort { $a->[0] cmp $b->[0] } values %thread;

    return @threads;
}

# This will help us generate the URL based on the current day.
my @months = qw(
    January February March     April   May      June
    July    August   September October November December
);

my $base_url       = 'http://mail.pm.org/pipermail/phoenix-pm';
my $wanted_threads = 15;    # keep looking back until we have this many
my $max_months     = 24;    # ...but never look back further than this

# localtime yields a 0-based month and a year counted from 1900.
my ($mon, $year) = (localtime)[4, 5];

# Find the threads for the current month, and if there aren't enough go back
# to last month, and so on.
my @threads;
my $months_tried = 0;
while (@threads < $wanted_threads && $months_tried < $max_months) {
    my $month = ($year + 1900) . '-' . $months[$mon];

    # A month's archive may not be fetchable (get_threads dies in that case);
    # skip it rather than losing the whole feed.
    my @found = eval { get_threads("$base_url/$month") };
    if ($@) {
        warn "Skipping $month: $@";
    }
    push @threads, @found;

    $months_tried++;
    $mon--;
    if ($mon < 0) {
        $mon = 11;
        $year--;
    }
}

die "No threads found in the last $max_months months\n" unless @threads;

# Now we've got all of our threads, we just spit out some RSS.
my $rss = XML::RSS->new;
$rss->channel(
    title       => 'Phoenix.PM Mailing List Topics',
    link        => $base_url,
    description => 'Scraped topic list for Phoenix.PM',
);

foreach my $thread (@threads) {
    my ($url, $topic) = @$thread;
    $rss->add_item(
        link  => $url,
        title => $topic,
    );
}

print $rss->as_string;