#!/usr/bin/perl

#
#  This program is intended to alert the reader about followups in
#  newsgroups.  It is best used in conjunction with Gnus and the
#  functions found in ego-watch.el.

#  Sample use: Once in a while, I have a question for
#  comp.os.linux.hardware.  I don't often read that group, since I'm
#  mostly interested in answers to my own questions, not to others'.
#  After I post a question, I try to check a newsgroup for a few days
#  to see what answers turn up, but it's easy to forget.  It's also
#  easy to miss answers that come surprisingly long after the original
#  question.

# This program takes care of that.  When I post a question, I select
# the posted article in Gnus and hit F5.  This writes some data to the
# file .newswatch/newswatch.in, consisting of newsgroup name, header
# and match information, as well as an expiration date.  This is a
# text file, so you *could* write this data in the file by hand, but I
# doubt you want to.

# A few times a day, this program runs as a cron job, checking for
# matches for each of the entries.  For example, if I post a question,
# then I can set the program to check for every post with the same
# subject (as a substring) as my original question.  If one or more
# matches are found, the program sends an email to me, with a short
# synopsis of the match.  That way, no missed answers to my questions.

# You can also ask for matches on the References header instead of the
# subject header.  That way, if a thread is very long and most of it
# is dull, the program will only alert you to posts in which your
# message ID appears in the References field -- that is, posts which
# are directly down-thread of yours.

# If you're in Usenet stalking mode, you could ask the program to look
# for posts with a given From: header.  Any header at all will do, and
# the matching is done via regexp, allowing for quite sophisticated
# tweaking.

# To use: Change the variables $email_address and $srv_name below.
#
# Set up a crontab entry.  Here's mine (I check every hour, which is
# overkill, but it really doesn't take much of the server's time at
# all):
#
#  46 * * * * /home/jesse/bin/egowatch.pl
#
# The program can also be called on the command line, with -? printing
# some basic help info.  You should create a ~/.newswatch directory,
# and perhaps also an empty newswatch.in file if you want to just play
# with it for a moment.

# To do: At present, the program searches through the articles that
# have arrived since the previous run.  It would be nice if it also
# read .newsrc (or is it .newsrc.eld?) so that the user *doesn't*
# receive email for messages he's already seen.

# Credits and blame:
#
# This program was written by Jesse Hughes, jesseh@cs.kun.nl.  It may
# be freely modified, distributed, whatever, as long as the source
# file continues to acknowledge the contribution of the original
# author.

# Please email for any fixes, questions, complaints, suggestions, etc.
# I'm not much of a Perl hacker, so criticisms are welcome.

# Format of the watch file:
# newsgroup name:header:contents:expiration
#
# comment character "#"

use strict;
use App::Config;
use Text::CSV;
use Net::NNTP;
use Mail::Send;

# CSV parser used by parse_watch_record to split each watch-file record.
my $csv = Text::CSV->new;
# Maps group name -> { header name -> [search strings] }; filled by
# add_watch_record.
my %watch_list;
# Directory holding the watch file and the timestamp file.
my $config_dir = $ENV{HOME}."/.newswatch";
# Name of the file (inside $config_dir) whose modification time records
# when the previous run happened.
my $time_file = ".egowatch.timestamp";

# Maps message id -> hash of saved headers, body excerpt and matched
# search terms; filled by record_match.
my %matches;

# App::Config object holding the command-line options; set by init_cfg.
my $cfg;
# Name of the watch file inside $config_dir.
my $watch_file="newswatch.in";
# Lines to be written back to the watch file: comments, blank lines and
# records that have not expired.
my @watch_list_output;
# Pieces of the notification email body; built by create_email_output.
my @email_output;

####################################################################
#
# CHANGE the following two values, unless your name is Jesse Hughes.
#
####################################################################

# Host name of the NNTP server that is queried.
my $srv_name = "nntp-srv.sci.kun.nl";
# Address the notification is mailed to; write_email also hands it to
# sendmail as the sender (-f).
my $email_address = "jesseh\@cs.kun.nl";

####################################################################
#
# You DID change those two values, right?
#
####################################################################

# Connect to the news server up front; give up at once if that fails.
my $nntp = Net::NNTP->new($srv_name)
  or die "NO nntp!  Maybe there is no connection to the server at ".$srv_name."?";


##########################################################################
#
# FILE I/O SECTION
#
##########################################################################

###
#
# watch_path
#
# Prepends $config_dir.  Prob'ly better ways to do this.
#
###
sub watch_path{
  # Returns the path of the named file inside the configuration directory.
  my ($file_name) = @_;
  return join("/", $config_dir, $file_name);
}

###
#
#  open_watch_file_for_input
#
#  Prepare to read watch_file.
#
###

sub open_watch_file_for_input{
  # Opens the watch file for reading on the global WATCH_HANDLE, which
  # read_watch_file consumes.  Dies if the file cannot be opened.
  my $fullpath = watch_path($watch_file);

  # Three-argument open: the path is taken literally, so no character in
  # it can be mistaken for an open mode or a pipe.  $! says why it failed.
  open(WATCH_HANDLE, '<', $fullpath)
    or die("Could not open watch file $fullpath for input: $!");
}

####
#
# open_watch_file_for_output
#
# (The name is longer than the subroutine.)
#
####

sub open_watch_file_for_output{
  # Points the global WATCH_HANDLE at the destination for the rewritten
  # watch list: STDOUT in debug mode, otherwise the watch file itself
  # (which is truncated).  Dies if the file cannot be opened.
  my $fullpath = watch_path($watch_file);

  if ($cfg->debug){
    # In debug mode nothing on disk is touched; the new list is shown.
    *WATCH_HANDLE = *STDOUT;
    print "\n\n  ********* New watch file ********* \n\n";
  }
  else {
    # Three-argument open: the path is taken literally, so no character
    # in it can be mistaken for an open mode or a pipe.
    open(WATCH_HANDLE, '>', $fullpath)
      or die("Could not open watch file $fullpath for output: $!");
  }
}

####
#
#  read_watch_file
#
####
sub read_watch_file{
  # Reads every line from WATCH_HANDLE, registers each record in the
  # watch list, and collects the lines to keep in @watch_list_output.
  # Closes WATCH_HANDLE when done.
  while (<WATCH_HANDLE>) {
    # Comment lines and blank lines are carried over untouched.
    if (/^\s*#/ || !/\S/) {
      push @watch_list_output, $_;
      next;
    }

    # parse_watch_record reads the current line from $_.
    my ($group, $header, $subj, $expire) = parse_watch_record();
    add_watch_record($group, $header, $subj);

    # An expired record is still searched on this run, but it is not
    # written back to the watch file.
    push @watch_list_output, $_ unless purge_record($expire);
  }
  close(WATCH_HANDLE);
}


####
#
# write_watch_list
#
####
sub write_watch_list {
  # Writes the retained watch-file lines, in order, to WATCH_HANDLE.
  print WATCH_HANDLE $_ for @watch_list_output;
}

####
#
#  parse_watch_record
#
####

sub parse_watch_record{
  # Splits the current line ($_, set by the caller's read loop) into
  # fields with the shared $csv parser.
  #
  # Returns the first four fields: (group, header, search string,
  # expiration).  Dies, quoting the offending input, if the line cannot
  # be parsed.
  unless ($csv->parse($_)) {
    # die never returns, so nothing further is needed on this path.
    die "parse() failed on argument: ", $csv->error_input, "\n";
  }

  my ($group,$header,$subj,$expire) = $csv->fields;
#    if ($header eq "Message-Id"){
#      $header = "References";}
  return ($group,$header,$subj,$expire);
}

####
#
# write_email($subj, @body)
#
####

sub write_email{
  # write_email($subj, @body)
  #
  # Mails @body to $email_address with subject $subj, via sendmail.
  my ($subject, @body) = @_;

  my $msg = Mail::Send->new;
  $msg->to($email_address);
  $msg->subject($subject);

  # My mail server doesn't allow incoming messages from unknown hosts.
  # Thus, I have sendmail forge the from address so that the recipient
  # and sender are the same.
  my $fh = $msg->open("sendmail", "-f".$email_address);

  # Plain print, not printf: the body holds text taken from news
  # articles, and a "%" in it must not be read as a format directive.
  foreach my $chunk (@body){
    print $fh $chunk;
  }

  $fh->close;

}

####
#
# read_last_time_watched
#
####

sub read_last_time_watched{
  # Returns the time from which articles should be searched: the
  # "fromtime" option when it is set, otherwise the modification time of
  # the timestamp file, otherwise 0 when that file cannot be stat'ed.
  my $override = $cfg->get("fromtime");
  return $override if $override;

  # Element 9 of the stat list is the modification time.
  my @file_info = stat(watch_path($time_file));
  return @file_info ? $file_info[9] : 0;
}

####
#
# write_last_time_watched
#
####

sub write_last_time_watched{
  # Records "now" as the time of this run by setting the access and
  # modification times of the timestamp file, creating the file when it
  # does not exist yet.  Dies if the file can be neither touched nor
  # created.
  my $time_path = watch_path($time_file);

  my $now = time;
  if ($cfg -> verbose){
    print "\n\nWriting $time_path at ",time_to_string($now),".\n";
  }

  # utime returns the number of files changed; 0 means it failed
  # (typically because the file is missing), so create the file instead.
  unless (utime($now, $now, $time_path)){
    open(my $new_fh, '>', $time_path) or die "can't open $time_path: $!";
    close($new_fh)                    or die "can't close $time_path: $!";
  }

}

####
#
# time_to_string
#
####

sub time_to_string{
  # Formats an epoch time as "MM/DD/YY HH:MM:SS" in local time.
  my $time = shift;
  my ($sec,$min,$hour,$mday,$mon,$year) = localtime($time);

  # localtime numbers months from 0 (January), so add 1 for display.
  # $year counts from 1900; "% 100" leaves the two-digit year.
  return sprintf("%02u/%02u/%02u %02u:%02u:%02u",
		 $mon + 1,$mday,$year % 100,$hour,$min,$sec);
}


####
#
# google (Called by create_email_output to add a Google Groups link.)
#
####

sub google{
  # Returns a "Google link: <URL:...>" line for the given message id
  # (without its angle brackets), terminated by a newline.  A link longer
  # than 70 characters is broken after the 70th character and continued
  # on an indented second line.
  my $id = shift;
  my $link = "Google link: <URL:http://groups.google.com/groups?as_umsgid=".$id.">";

  # Short links fit on one line; no continuation line is needed.
  return $link."\n" if length($link) <= 70;

  # The continuation starts at offset 70, directly after the first
  # piece, so no character of the message id is lost.
  return substr($link,0,70)."\n              ".substr($link,70)."\n";
}
####
#
# create_email_output
#
####

sub create_email_output{
  # Fills @email_output with the notification text: a short preamble
  # naming the time of the previous check, followed by one section per
  # entry in %matches (headers, body excerpt, matched terms and links).
  my ($last_time) = @_;

  @email_output = (
    "Egowatch.pl output.\n\n",
    "There are recent matches in your watch list.\n",
    "\nThese entries arrived since ",
    time_to_string($last_time),
    ".\n\n",
  );

  while (my ($id, $match) = each %matches) {
    my $subj    = $match->{'subject'};
    my $from    = $match->{'from'};
    my $grp     = $match->{'group'};
    my $date    = $match->{'date'};
    my @excerpt = @{ $match->{'excerpt'} };

    # Strip the surrounding angle brackets from the message id: keep
    # what follows the first "<", then what precedes the first ">".
    my ($before, $id_stripped) = split('<', $id);
    ($id_stripped, my $after) = split('>', $id_stripped);

    # First matched term is labelled, the rest are indented beneath it.
    my @terms = @{ $match->{'greppers'} };
    my @greppers = ("Matches: ".$terms[0]."\n");
    push @greppers, map("         ".$_."\n", @terms[1 .. $#terms]);

    push @email_output,
      "-----------------------------------------------------------\n\n",
      "Subject: $subj",
      "From: $from",
      "Newsgroups: $grp",
      "Date: $date\n",
      "Message id: <news:", $id_stripped, ">\n\n",
      @excerpt,
      "\n",
      @greppers,
      "\nMessage id: <news:", $id_stripped,
      ">\n".google($id_stripped)."\n\n";
  }
}

######################################################################
#
# END FILE I/O SECTION
#
######################################################################

######################################################################
#
# SEMI-GRACEFUL EXIT SECTION
#
######################################################################

####
#
# help
#
####

sub help {
  # Prints the usage summary to STDOUT and ends the program.
  print
    "\nUsage: egowatch.pl [--time/-t time][--verbose/-v][--debug/-d][--help/-?].\n\n",
    "Checks an NNTP server for recent posts matching a subject or reference.\n\n",
    "--time       Override time file and search from time included.\n",
    "--verbose    For verbose output to STDOUT.\n",
    "--debug      For verbose output and data printed to the screen.\n",
    "\nThis is a poorly documented hack.  Tough.\n";
  exit;
}


####
#
# panic
#
####

sub panic {
  # Installed as the App::Config ERROR handler by init_cfg.  Reports
  # incorrect usage, shows the help text, and ends the program.  Any
  # arguments passed to the handler are ignored.
  print "Error: incorrect usage.\n\n";
  help();
  exit;
}

######################################################################
#
# END SEMI-GRACEFUL EXIT SECTION
#
######################################################################

######################################################################
#
# WATCH LIST CREATION SECTION
#
######################################################################

####
#
# purge_record
#
# return true if record is old.
#
####

sub purge_record{
  # True when the given expiration time (epoch seconds) lies in the
  # past, i.e. the record should be dropped from the watch file.
  my ($expire_date) = @_;

  return time() > $expire_date;
}

##
#
# add_watch_record($grp,$header,$search)
#
# 
# About the data structure.
#
#  %watch_list is an association of group names to "group hashes".
#
#  Each group hash has one entry for each header encountered:
#  E.g.: (Subject, search_list)
#
#  The "search_list" is a list of search strings (for the
#  subjects/ids/whatever).
##

sub add_watch_record{
  # Appends search string $subj to the list kept for header $header in
  # group $grp.  The group hash and the list are created on first use.
  my ($grp,$header,$subj) = @_;

  # push autovivifies both missing levels of %watch_list.
  push @{ $watch_list{$grp}{$header} }, $subj;
}

######################################################################
#
# END WATCH LIST CREATION SECTION
#
######################################################################

######################################################################
#
# FIND AND RECORD MATCHES SECTION
#
######################################################################


####
#
# process_watch_list()
#
# This function is called when the hash, %watch_list, is filled.
#
# It is responsible for fetching the list of posts for each group,
# and then grepping that group for appropriate posts.
#
####

sub process_watch_list{
  # Searches every group in %watch_list for articles that arrived since
  # the previous run and records those whose headers match.
  #
  # Afterwards the timestamp file is updated and, if anything matched,
  # @email_output is built.  Returns a true value if there was at least
  # one match, a false value otherwise.
  #
  # Dies with the server's reply if a group cannot be selected or its
  # list of new articles cannot be fetched.  This happens before the
  # timestamp is updated, so the same period is searched again on the
  # next run.
  my $matchflag = 0;
  my $last_time = read_last_time_watched();

  if ($cfg->verbose){
    print "Last checked: ",time_to_string($last_time),".\n";
  }

  while (my ($grp, $grp_hash_ref) = each %watch_list){
    ## $grp is the name of the group.
    ## $grp_hash_ref is the reference of the hash for the message_ids and
    ## subjects.

    # A failed selection leaves the previously selected group current,
    # so stop here rather than search the wrong group.
    $nntp->group($grp)
      or die "Could not select group $grp: ".$nntp->message;

    my $grep_str = grep_str($grp_hash_ref);

    # Name the group explicitly, and check the result before using it:
    # newnews returns undef when the server refuses the request.
    my $posts_ref = $nntp->newnews($last_time, $grp);
    defined $posts_ref
      or die "Could not list new articles in $grp: ".$nntp->message;

    if ($cfg->verbose){
      print "\n\nChecking $grp.\n Grepping for $grep_str.\n\n";
    }

    foreach my $id (@{$posts_ref}){
      if ($cfg->verbose){
	print "$id\n";
      }

      my $grep_result = header_grep($id,$grep_str);
      if ($grep_result){
	if ($cfg -> verbose){
	  print "\n\n\nMatch! $id\n";
	}
	record_match($id,$grp_hash_ref);
      }
      $matchflag ||= $grep_result;
    }
  }

  write_last_time_watched();

  if ($matchflag){
    create_email_output($last_time);
  }

  return $matchflag;
}


####
#
# grep_str
#
# create grep string.
#
####

sub grep_str{
  # Builds the regexp used to test article headers against one group's
  # watch entries.  The argument is a reference to a hash that maps a
  # header name to a reference to a list of search strings.
  #
  # Each header contributes "^Header: .*(term1|term2|...)"; the pieces
  # for the individual headers are joined with "|".
  my ($hash_ref) = @_;
  my %grp_hash = %{$hash_ref};

  my @alternatives = map {
    my $header = $_;
    "^$header: .*(" . join('|', @{ $grp_hash{$header} }) . ")";
  } keys %grp_hash;

  return join("|", @alternatives);
}

####
#
# record_match(id,grp_hash_ref)
#
####
sub record_match{
  # record_match(id, grp_hash_ref)
  #
  # Stores the article with message id $id in %matches.  On the first
  # match the Subject, Newsgroups, From and Date header values and a
  # body excerpt are saved.  On every call the search patterns from
  # $grp_hash_ref (header name -> list of search strings) that match
  # the article's headers are added to its 'greppers' list.
  my $id = shift;
  my $grp_hash_ref = shift;
  my %grp_hash = %{$grp_hash_ref};
  my @term_list = ();
  my %saved_headers;

  # head returns undef when the article cannot be fetched; treat that
  # as an article without headers instead of dereferencing undef.
  my $head_ref = $nntp -> head($id);
  my @headers = defined $head_ref ? @{$head_ref} : ();

  if ($matches{$id}) {
    # Already recorded (e.g. a crossposted article matched in another
    # group): keep what was saved, including the terms found so far.
    %saved_headers = %{$matches{$id}};
    @term_list = @{ $saved_headers{'greppers'} || [] };
  }
  else {
    $saved_headers{'subject'} = _first_header_value('Subject', \@headers);
    $saved_headers{'group'}   = _first_header_value('Newsgroups', \@headers);
    $saved_headers{'from'}    = _first_header_value('From', \@headers);
    $saved_headers{'date'}    = _first_header_value('Date', \@headers);

    $saved_headers{'excerpt'} = excerpt_body($id);
  }

  while (my ($header,$search_list_ref) = each %grp_hash) {
    foreach my $term (@{$search_list_ref}){
      my $s = "^$header: .*$term";
      next unless grep(/$s/,@headers);
      # Do not list the same pattern twice for one article.
      next if grep { $_ eq $s } @term_list;
      push @term_list, $s;
    }
  }
  $saved_headers{'greppers'} = \@term_list;

  $matches{$id} = \%saved_headers;
}

# Returns the text after "Name: " on the first header line that starts
# with it, trailing newline included.  Returns a bare newline when there
# is no such line, so the email keeps one header per line.
sub _first_header_value{
  my ($name, $headers_ref) = @_;

  foreach my $line (@{$headers_ref}){
    return $1 if $line =~ /^\Q$name\E: (.*)\z/s;
  }
  return "\n";
}


####
#
# excerpt_body
#
# create array of lines from body of match.
#
####

sub excerpt_body{
  # Fetches the body of article $id and returns a reference to a list of
  # excerpt lines, each indented by three spaces.
  #
  # Runs of quoted lines (starting with ">" or ":") are replaced by a
  # "[Quoteblock snipped...]" marker, repeated blank lines are collapsed,
  # and reading stops at a "-- " signature separator or once 20 lines
  # have been collected.  The list is empty when the body cannot be
  # fetched or yields nothing to excerpt.
  my $id = shift;
  my $body_ref = $nntp->body($id);
  my @excerpted = ();

  if (defined($body_ref)){
    my @orig_body = @{$body_ref};

    while ((scalar(@excerpted) < 20) && defined(my $line = shift(@orig_body))){

      if ($line =~ /^[>:]/){
	# Skip the rest of the quote block.
	# NOTE(review): this also skips a "-- " line that directly follows
	# the quotes, so the signature test below never sees it -- confirm
	# that this is intended.
	while (($line =~ /^[>:]|^-- /) && defined($line = shift(@orig_body))){}

	if (@excerpted && ($excerpted[-1] =~ /\S/)){
	  ## If there are some lines already excerpted, and if the last line
	  ## excerpted has non-whitespace characters, add a new line.
	  push @excerpted, "\n";
	}
	push @excerpted, "[Quoteblock snipped...]\n", "\n";

	# The body ended inside the quote block: nothing left to excerpt.
	last unless defined $line;
      }
      if ($line =~ /^-- /){
	last;
      }
      # Keep text lines; keep a blank line only after a text line.
      if (($line =~ /\S/) || (@excerpted && ($excerpted[-1] =~ /\S/))){
	push @excerpted, $line;
      }
    }
  }
  @excerpted = map ("   ".$_, @excerpted);

  # Blank out a trailing whitespace-only line.  The emptiness check
  # matters: assigning to element -1 of an empty array is a fatal error.
  if (@excerpted && ($excerpted[-1] !~ /\S/)) {
    $excerpted[-1] = "";
  }

  return \@excerpted;
}

##
#
# syntax header_grep(article-id, grep_str)
#
# article_id is the Message-ID of the message to be grepped.
#
# grep_str contains the grep string.
#
# Returns: the number of header lines that match grep_str; 0 if none
# match or if the headers cannot be fetched.
##

sub header_grep{
  # Fetches the headers of article $id and greps them with $grep_str.
  # Returns the result of that grep (in scalar context, the number of
  # matching header lines), or 0 when the headers cannot be fetched.
  my ($id, $grep_str) = @_;

  my $head_ref = $nntp->head($id);
  return 0 unless defined $head_ref;

  return grep(/$grep_str/, @{$head_ref});
}

######################################################################
#
# END FIND AND RECORD MATCHES SECTION
#
######################################################################


##
#
# init_cfg: Defines the command line and init file configuration.
#
##
sub init_cfg{
  # Creates the App::Config object in $cfg, with panic as its error
  # handler, and defines the four options: help, debug, verbose and
  # fromtime.
  $cfg = App::Config->new({ ERROR => \&panic });

  # One [name, definition] pair per option, defined in this order.
  my @options = (
    [ "help",     { CMDARG => ['--help', '-?'],    ARGCOUNT => 0 } ],
    [ "debug",    { ARGCOUNT => 0, CMDARG => ['--debug','-d'] } ],
    [ "verbose",  { ARGCOUNT => 0, CMDARG => ['--verbose','-v'] } ],
    [ "fromtime", { ARGCOUNT => 1, CMDARG => ['--time','-t'] } ],
  );

  foreach my $option (@options){
    my ($name, $definition) = @{$option};
    $cfg->define($name, $definition);
  }
}

##############################################################
# main
##############################################################

# Parse the command line first; everything below depends on $cfg.
init_cfg();
$cfg->cmd_line(\@ARGV);

if ($cfg->get("help")){
  help();
}
else {
  # Load the watch list, then search the server for matches.
  open_watch_file_for_input();
  read_watch_file();

  my $found_matches = process_watch_list();

  if ($found_matches && $cfg->debug){
    # Debug mode shows the notification instead of mailing it.
    print $_ for @email_output;
  }
  elsif ($found_matches){
    write_email("Egowatch.pl notification",@email_output);
  }
  elsif ($cfg->verbose){
    print "No matches.\n";
  }

  $nntp->quit;

  # Write the watch list back, without the expired records.
  open_watch_file_for_output();
  write_watch_list();

}