exec perl -Sx $0 ${1+"$@"} # -*-perl-*- if 0; #!perl -w ## ## License ## # dda.pl is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # dda.pl is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # You should have received a copy of the GNU General Public License # along with dda.pl; If not, write to the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. =pod =head1 NAME dda.pl - data digging agent =head1 SYNOPSIS Data Digging Agent [DDA], searches for interesting data in newsgroups. Currently only match on Subject Lines is implemented. dda.pl --group=string [--subject=string] [--nntpserver=string] =head1 DESCRIPTION '--nntpserver'. DDA uses the NNTP protocol to retrieve information from a server. You have to specify the name of the server as a command line option. '--group' specifies which group on the nntpserver shall be used. The string specified here is interpreted as perl regexp, so you could simply use --group='comp.cad.cadence' which will on most servers only match comp\.cad\.cadence or use more sophisticated regexps. '--subject' After a successfully established connection to the server articles which contain the specified string as part of their subject line are stored in the current working directory. The string is interpreted as perl rexexps too. =head1 EXAMPLES dda.pl --group=comp.cad.cadence --subject=skill --nntserver=news This retrieves form the nntpserver 'news' all articles which contain the word 'skill' and are stored on the server in the group 'comp.cad.cadence'. This is all a bit boring if you want many multipart mime encoded messages on your harddisk you could try something like. dda.pl --group=bin.*mp3 --subject=led.*zeppelin --nntserver=news Which will make you known to your local news feed administrator. For reconstructing the binary data a tool named uudeview helps a lot. =head1 AUTHOR Frank Hartmann email: soundart@gmx.net =cut use strict; use lib '/user/hartmann/lib/perl5'; use News::NNTPClient; use FrankLib; use Data::Dumper; use Getopt::Long; use Text::Soundex; use vars qw ( %scriptOptions %word_index %s_hex %log_per_group $news $first $last $subject $nr $group $tmp $search_term $opt $filename $server $word $line @groups @result @lines @log @log2 @prefs ); %scriptOptions=(); %word_index=(); $filename = '~/.dda_active'; ## ## delete old log file ## if (-w 'dda.log') { unlink 'dda.log'; } &add_log ('dda.log', "DDA======================================================================\n"); &add_log ('dda.log', "DDA Data Digging Agent [DDA]\n"); &add_log ('dda.log', "DDA Options: @ARGV\n"); &add_log ('dda.log', 'DDA $Id: dda.pl,v 1.1 2002/03/19 08:45:58 FrankHartmann Exp mattzz $ ',"\n"); &add_log ('dda.log', "DDA======================================================================\n"); &GetOptions(\%scriptOptions, "group=s","subject=s@","nntpserver=s"); foreach $opt (qw /group/) { if (!exists $scriptOptions{$opt}) { print "please use:\n\npod2text $0\n\nto access Documentation!\n"; die ("--$opt=VALUE needed\n"); } } @groups =(); @log=(); ## ## connect to news Server ## $server = $scriptOptions{'nntpserver'} || $ENV{NNTPSERVER}; &add_log ('dda.log', "DDA Connecting to Newsserver $server....\n") ; $news = new News::NNTPClient($server,119); ## ## read active file from .dda_active or NNTPSERVER ## # from Perl Cookbook 7.3 $filename =~ s{ ^ ~ ( [^/]* ) } { $1 ? (getpwnam($1))[7] : ( $ENV{HOME} || $ENV{LOGDIR} || (getpwuid($>))[7] ) }ex; if (-r $filename) { &add_log ('dda.log', "DDA reading active file from $filename\n"); @groups = &read_file($filename); } else { @groups= $news->list('active'); &write_file ($filename,@groups); } &add_log ('dda.log', "DDA .............................ok\n"); foreach $tmp (@groups) { @result = split / /,$tmp ; $group =$result[0]; # restrict groups to groups in alt hierarchy if ($group =~ /$scriptOptions{'group'}/) { &add_log ('dda.log', "DDA======================================================================\n"); &add_log ('dda.log', "DDA Entering $group\n"); &add_log ('dda.log', "DDA======================================================================\n"); ($first,$last) = $news->group($group); if (!$news->ok()) { &add_log ('dda.log', "DDA dead group, skipping\n"); next; } if ($first > $last) { &add_log ('dda.log', "DDA confused Newsserver, skipping\n"); next; } #print $news->article($last); # all Subjects on Server @result = $news->xhdr('Subject',$first,$last); &add_log ('dda.log', "DDA $#result+1 Articles in group\n"); &append_file ('dda.log', @result); $log_per_group{$group} = [ @result ]; @lines =(); foreach $subject (@result) { foreach $search_term (@{$scriptOptions{'subject'}}) { if ($subject =~ /$search_term/i) { push @lines, $subject; } } } &add_log ('dda.log', "DDA $#lines+1 Articles matched\n"); foreach $subject (@result) { foreach $search_term (@{$scriptOptions{'subject'}}) { if ($subject =~ /$search_term/i) { if ($subject =~ /^([0-9]+)/) { $nr = $1; # check for expired Articles @lines = $news->article($nr); #print Dumper \@lines; if (-s "$group.$nr") { &add_log ('dda.log', "DDA retrieving $nr skipped, already present\n"); } else { if ($news->ok()) { &add_log ('dda.log', "DDA retrieving $nr\n"); &write_file ("$group.$nr",$news->article($nr)); &add_log ('dda.log', "DDA ......................ok\n"); } } } } } } } } #print @result; #print $news->help; ## ## create a sorted version of log file for overview purposes ## @log =&read_file('dda.log'); @lines =(); foreach $tmp (@log) { $tmp =~ /^DDA/ and next; @result = split / /,$tmp; &create_word_index(@result); $nr = shift @result; push @lines, join ' ',@result; } @result = sort @lines; &write_file('dda_sorted.log',@result); @lines=(); #print Dumper \%log_per_group; foreach $tmp (keys %log_per_group) { @log = @{ $log_per_group{$tmp} }; @log2 = (); foreach $line (@log) { @result = split / /,$line; $nr = shift @result; push @log2, join ' ',@result; } push @lines,"DDA======================================================================\n" ; push @lines,"DDA Entering $tmp\n"; push @lines,"DDA======================================================================\n" ; push @lines, sort @log2; } &write_file('dda_sorted_per_group.log',@lines); ## ## create word frequency list ## @result = &sort_word_list(%word_index); #print Dumper \@result; #print Dumper \%word_index; @lines=(); foreach $tmp (@result) { #print "top $word_index{$tmp} hits: $tmp \n"; push @lines, "top $word_index{$tmp} hits: $tmp \n"; } &write_file('dda_top.log',@lines); @prefs = (); $filename = "$ENV{HOME}/.dda_prefs"; if (-e $filename) { if (!&is_safe($filename)) { die "file $filename can be written by others than you please correct\n"; } } else { @lines=<<"END_OF_TEMPLATE"; ## ## search preference for dda.pl ## ## uncomment & change the following line ##@prefs = qw /search me/; @prefs = (); END_OF_TEMPLATE &add_log ('dda.log', "DDA Reading personal search preferences $filename\n"); &write_file($filename,@lines); } unless ($tmp = do $filename) { warn "couldn't parse $filename: $@" if $@; warn "couldn't do $filename:: $!" unless defined $tmp; warn "couldn't run $filename::" unless $tmp; } &add_log ('dda.log', "DDA Calculating Soundexs\n"); foreach $tmp (@prefs) { $s_hex{soundex ($tmp)} = 1; } @result=(); foreach $tmp (keys %word_index) { if (exists($s_hex{soundex ($tmp) })) { push @result, "Possible Match found $tmp\n"; } } &write_file('dda_prefs.log',@result); &add_log ('dda.log', "DDA have fun\n"); ###################################################################### # sub modules ###################################################################### use File::stat; ## ## from perl cookbook ## 8.17. Testing a File for Trustworthiness ## sub is_safe { my $path = shift; my $info = stat($path); return unless $info; # owner neither superuser nor me # the real uid is in stored in the $< variable if (($info->uid != 0) && ($info->uid != $<)) { return 0; } # check whether group or other can write file. # use 066 to detect either reading or writing if ($info->mode & 022) { # someone else can write this return 0 unless -d _; # non-directories aren't safe # but directories with the sticky bit (01000) are return 0 unless $info->mode & 01000; } return 1; } ## ## create a word count hit list ## sub create_word_index { my(@words)=@_; my($word); foreach $word (@words) { # clean $word $word = lc($word); for ($word) { s/^[^a-z]*//; s/[^a-z]*$//; } #$word =~ tr/[^a-z]//; next if (length ($word) <5); #print $word; if (! ($word eq '')) { $word_index {$word }++; } } } sub sort_word_list { my(%hash)=@_; my(@keys); @keys = sort { $hash{$b} <=> $hash{$a} } keys %hash; # and by value return @keys; }