#!/usr/bin/perl

# RSSGEN.pl
# Generates RSS feed from website and definition file.
# The definition file describes how to create RSS from a webpage. The resulting RSS can then be used in your reader.
#
# TO WRITE THE DEFINITION FILE:
#  Line 1: The address or addresses to download
#  Line 2: The selector of title.
#  Line 3: The selector of the link. Usually the same as line 2, content of href is used by default
#  Line 4: The selector of content.
#  Line 5: the ;-separated kill-list of words for both title and content
#  Line 6: The ;-separated kill-list of words in title
#  Line 7: The ;-separated kill-list of words in content
#  Line 8: The ;-separated kill-list of words to delete in content/title.
# If an item contains a word from a kill-list (lines 5-7), it is not added to the RSS. Words listed on line 8 are only deleted from the item's title/content.
#
# If you want to get a-tag with class "ex yy zz" you do: 
#  //a[@class="ex yy zz"]
# So generally //tag[@attr="..."]
#
# MULTIPLE ADDRESSES: Separate them with the sequence: ; 
#                   EXACTLY THIS: Space Semicolon Space.
#
# USAGE:
#  rssgen.pl definitionfile output.xml
#
# MCbx 2020

use warnings;
use strict;
use LWP::UserAgent;
use HTML::TreeBuilder::XPath qw();

# Command-line check: the script needs exactly two arguments,
# the definition file to read and the feed file to write.
unless (scalar(@ARGV) == 2)
{
   print "Use rssgen inputfile feed.xml\n";
   exit 1;
}
# Printed once the argument check has passed.
# NOTE(review): the purpose of this banner text is not evident from the script.
print "INSTALLATION\n";

# Read the definition file named by the first command-line argument.
# The first eight lines are stored, in order, as: URL list, title selector,
# link selector, content selector, kill-list for title+content, kill-list
# for title, kill-list for content, eraser list. Further lines are counted
# but otherwise ignored. $linecount ends up as the total number of lines read.
open (my $fh,'<',$ARGV[0]) or die "Cannot open input file '$ARGV[0]': $!";
my $rurl="";
my $header="";
my $link="";
my $content="";
my $killall="";
my $killtitle="";
my $killcontent="";
my $eraser="";
my $linecount=0;

# Destination of each definition line, indexed by zero-based line number.
my @definition_fields=(\$rurl,\$header,\$link,\$content,\$killall,\$killtitle,\$killcontent,\$eraser);

while (my $line=<$fh>)
{
   # Strip both LF and CR so files with Windows line endings work too.
   $line=~s/[\r\n]//g;
   if ($linecount < scalar(@definition_fields))
   {
      ${ $definition_fields[$linecount] }=$line;
   }
   $linecount++;
}

# An empty link selector means "same as the title selector". This is done
# after the loop so it also applies when the file ends before line 3.
if ($link eq "")
{
   $link=$header;
}

# Without at least one address there is nothing to download; stop here.
if ($rurl eq "")
{
   print "Cannot load empty URL. File probably wrong.\n";
   # Non-zero status so that callers (cron, shell scripts) can see the failure.
   exit 1;
}

#prepare killlists
# Split the raw definition lines into lists. Addresses are separated by
# " ; " (space, semicolon, space); kill-list and eraser words by ";".
# Empty entries are dropped: an empty word would later be used as an empty
# regex pattern, which Perl interprets as "reuse the last successful
# pattern" and which would therefore filter items unpredictably.
my @urls;
my @killall;
my @killtitle;
my @killcontent;
my @eraser;

foreach my $address (split(/ ; /,$rurl))
{
   # Stray whitespace around an address is never part of the URL.
   $address=~s/\A\s+//;
   $address=~s/\s+\z//;
   push @urls,$address if $address ne "";
}
@killall=grep { $_ ne "" } split(/;/,$killall);
@killtitle=grep { $_ ne "" } split(/;/,$killtitle);
@killcontent=grep { $_ ne "" } split(/;/,$killcontent);
@eraser=grep { $_ ne "" } split(/;/,$eraser);
close $fh;

#Preparation of feed:
# Create (or truncate) the output file and write the opening of the RSS 2.0
# document. The pseudo-attribute in the XML declaration must be spelled
# "version" in lower case, otherwise the document is not well-formed.
# NOTE(review): the channel is written without title/link/description
# elements; confirm whether the target readers require them.
open (my $oh,'>',$ARGV[1]) or die ("Cannot open target file '$ARGV[1]' for writing: $!");
print $oh "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n<rss version=\"2.0\">\n<channel>\n";

# Per-page counters: $q = items processed, $r = items written to the feed.
my $q=0;
my $r=0;
print "Loaded $linecount lines from file.\n";
# Global counters over all pages: $gq = processed, $gr = written.
my $gq=0;
my $gr=0;
# Escape the characters that would otherwise be read as markup inside XML
# element text. An undefined value is treated as an empty string.
sub _xml_escape
{
   my ($text) = @_;
   return "" unless defined $text;
   $text =~ s/&/&amp;/g;
   $text =~ s/</&lt;/g;
   $text =~ s/>/&gt;/g;
   return $text;
}

# Turn a list of kill/eraser words into compiled patterns. Words are used as
# regular expressions (as before); a word that is not a valid regex, such as
# "C++", is matched literally instead of aborting the script. Empty words
# are skipped. $ignore_case selects case-insensitive matching.
sub _compile_patterns
{
   my ($ignore_case, @words) = @_;
   my @compiled;
   foreach my $word (@words)
   {
      next if !defined $word || $word eq "";
      my $re = eval { $ignore_case ? qr/$word/i : qr/$word/ };
      if (!defined $re)
      {
         $re = $ignore_case ? qr/\Q$word\E/i : qr/\Q$word\E/;
      }
      push @compiled, $re;
   }
   return @compiled;
}

# Return 1 if $text matches at least one of the compiled patterns, else 0.
sub _matches_any
{
   my ($text, @patterns) = @_;
   foreach my $re (@patterns)
   {
      return 1 if $text =~ $re;
   }
   return 0;
}

# Make the link found on a page absolute, using the page address as base.
# Links that already carry a scheme (http:, https:, mailto:, ...) are
# returned untouched. The scheme of the page is kept (http stays http).
# Only a prefix is ever added in front of $href, so whatever escaping $href
# already has is preserved; the added prefix is XML-escaped here.
sub _absolutize_link
{
   my ($href, $page_url) = @_;
   return $href if $href =~ m{\A[A-Za-z][A-Za-z0-9+.\-]*:};

   my $scheme = "https";
   my $origin = "";
   if ($page_url =~ m{\A([A-Za-z][A-Za-z0-9+.\-]*)://([^/?\#]+)})
   {
      $scheme = $1;
      $origin = "$1://$2";
   }
   (my $without_fragment = $page_url) =~ s/\#.*\z//s;
   (my $without_query = $without_fragment) =~ s/\?.*\z//s;

   # Protocol-relative link: only the scheme is missing.
   return "$scheme:$href" if $href =~ m{\A//};

   my $base;
   if ($href eq "" || $href =~ m{\A\#})
   {
      $base = $without_fragment;      # same document
   }
   elsif ($href =~ m{\A\?})
   {
      $base = $without_query;         # same path, different query
   }
   elsif ($href =~ m{\A/})
   {
      $base = $origin;                # path from the site root
   }
   else
   {
      # Path relative to the directory of the page.
      $base = $without_query;
      if ($base =~ m{\A[^:/]+://[^/]+\z})
      {
         $base .= "/";                # bare host without any path
      }
      else
      {
         $base =~ s{[^/]*\z}{};       # drop the last path segment
      }
   }
   return _xml_escape($base) . $href;
}

# The feed is declared as UTF-8, and the pages are decoded to characters
# below, so the output handle has to encode them back to UTF-8.
binmode($oh, ':encoding(UTF-8)') or die "Cannot set UTF-8 encoding on output: $!";

# The word lists do not change between pages, so compile them only once.
# Kill-lists are case-insensitive, the eraser is case-sensitive.
my @killall_patterns     = _compile_patterns(1, @killall);
my @killtitle_patterns   = _compile_patterns(1, @killtitle);
my @killcontent_patterns = _compile_patterns(1, @killcontent);
my @eraser_patterns      = _compile_patterns(0, @eraser);

my $ua = LWP::UserAgent->new;
$ua->timeout(60);

foreach my $url (@urls)
{
   print "Downloading url $url\n";

   my $response = $ua->get($url);
   if (!$response->is_success)
   {
      # Do not build feed items out of an error page.
      warn "Cannot download $url: " . $response->status_line . "\n";
      next;
   }
   # Decode according to the charset of the response; fall back to the raw
   # body when decoding is not possible.
   my $page = $response->decoded_content;
   $page = $response->content unless defined $page;

   print "Parse...\n";

   my $t = HTML::TreeBuilder::XPath->new;
   $t->parse($page);
   $t->eof;   # tell the parser that the document is complete
   my @title_nodes   = $t->findnodes($header);
   my @link_nodes    = $t->findnodes($link);
   my @content_nodes = $t->findnodes($content);
   $q = 0;
   $r = 0;

   #THIS IS MAIN ITEM LOOP
   # The n-th title, n-th link and n-th content node form one item.
   foreach my $title_node (@title_nodes)
   {
      my $index = $q;
      $q++;
      $gq++;

      #1. Title text.
      my $XTitle = $title_node->as_trimmed_text;

      #2. First href found in the serialized link node. A missing node or
      #   a node without href leaves the link empty, which is then resolved
      #   to the address of the page itself.
      # NOTE(review): the value is taken from XML output and is therefore
      # assumed to be XML-escaped already; confirm against as_XML_indented.
      my $XLink = "";
      if (defined $link_nodes[$index])
      {
         my $node_xml = $link_nodes[$index]->as_XML_indented;
         if ($node_xml =~ /href="(.*?)"/)
         {
            $XLink = $1;
         }
      }
      $XLink = _absolutize_link($XLink, $url);

      #3. Content text; empty when the page has fewer content nodes than titles.
      my $XDesc = "";
      if (defined $content_nodes[$index])
      {
         $XDesc = $content_nodes[$index]->as_trimmed_text;
      }

      #EVALUATION OF $XTITLE AND $XDESC VS KILL_LISTS (killall killtitle, killcontent)
      my $fail = 0;
      if (   _matches_any($XTitle, @killall_patterns)
          || _matches_any($XDesc,  @killall_patterns)
          || _matches_any($XTitle, @killtitle_patterns)
          || _matches_any($XDesc,  @killcontent_patterns))
      {
         $fail = 1;
      }
      next if $fail;

      #ERASER OPERATION
      foreach my $eraser_pattern (@eraser_patterns)
      {
         $XTitle =~ s/$eraser_pattern//g;
         $XDesc  =~ s/$eraser_pattern//g;
      }

      $r++;
      $gr++;
      #GENERATE FEED
      # Title and description are plain text and must be escaped for XML.
      print {$oh} "<item>\n<title>" . _xml_escape($XTitle)
         . "</title>\n<link>$XLink</link>\n<description>" . _xml_escape($XDesc)
         . "</description>\n</item>\n";
   }
   print "Processed in this page: $q, shown $r.\n";

   # Release the parse tree before the next page is loaded.
   $t->delete;
}
# Close the channel and the document. Errors from buffered writes only show
# up when the handle is closed, so the result of close is checked.
print $oh "</channel>\n</rss>\n";
close $oh or die "Cannot finish writing target file: $!";
print "END OF SCRIPT. PROCESSED $gq ITEMS, SHOWN $gr ITEMS.\n";
