./add-and-refresh-from-db.sh
#!/bin/sh
# 2022-07-26
# Add one record to the database under the invoking user's byline, then
# regenerate the static site from the database.

PATH=/usr/local/bin:/usr/bin:/bin

# USER is not guaranteed to be set (for example under cron); fall back to
# id(1) so the byline is never empty.
user=${USER:-$(id -un)}

# map the login name to the byline passed to tm-add-entry-sql.pl
case $user in
    'tuxmachines') author='Tux Machines' ;;
    'roy')         author='Roy Schestowitz' ;;
    'rianne')      author='Rianne Schestowitz' ;;
    'marius')      author='Marius Nestor' ;;
    'arindam')     author='Arindam Giri' ;;
    'trendoceans') author='Trend Oceans' ;;
    *)             author=$user ;;
esac

# add a record
tm-add-entry-sql.pl -a "$author"

# update both the XHTML and Gemtext hierarchies
refresh-site-from-db.sh

exit 0
./refresh-site-from-db.sh
#!/bin/sh
# 2022-07-25
PATH=/usr/local/bin:/usr/bin:/bin
umask 0002
# closure: remove the temporary feed files and the directory holding them.
# Reads the global ${tmpdir}.  Exits the script with status 1 when tmpdir
# is unset, empty, or not a directory.
closure() {
    # An empty tmpdir must be rejected explicitly: unquoted, `test -d` with
    # no operand evaluates to true and the rm glob below would be rooted
    # at "/".
    if [ -z "${tmpdir:-}" ] || [ ! -d "${tmpdir}" ]; then
        exit 1
    fi
    echo "Erasing temporary directories and their files."
    rm -f "${tmpdir}"/feed-*tmp.*
    rmdir "${tmpdir}"
}
# cancel: handler for the trapped signals.  Announces the cancellation,
# lets closure() tidy up the temporary files, then leaves with status 2.
cancel() {
    printf '%s\n' "Cancelled."
    closure
    exit 2
}
documentroot=/var/www/tuxmachines.org/htdocs
geminiroot=/home/gemini/gemini

# trap various signals to be able to erase temporary files
trap "cancel" 1 2 15

echo "Creating temporary directories and files"
tmpdir=$(mktemp -d /tmp/refresh-tmp.XXXXXX) || exit 1

# prepare final permissions
chgrp webmasters "${tmpdir}"
chmod g=rwxs "${tmpdir}"

# one file per feed; give up (and tidy) if any of them cannot be created
tmpfile_latest=$(mktemp -p "${tmpdir}" feed-latest-tmp.XXXXXXX) \
    || { closure; exit 1; }
tmpfile_xhtml=$(mktemp -p "${tmpdir}" feed-xhtml-tmp.XXXXXXX) \
    || { closure; exit 1; }
tmpfile_gemini=$(mktemp -p "${tmpdir}" feed-gemini-tmp.XXXXXXX) \
    || { closure; exit 1; }

# Work out the cut-off date once so that every step below uses the same
# day even when the run straddles midnight.
since=$(date -d '-2 days' +'%Y%m%d')

# create static XHTML and GemText
echo "Creating static XHTML and GemText hierarchies"
tm-extract-posts-sql.pl -f -d "${since}" -s

# make a list of new posts for an SSI include file
echo "Updating SSI files"
tm-generate-feed.pl \
    -d "${since}" \
    -n 15 \
    -x \
    > "${tmpfile_latest}"

# only publish non-empty output
if test -s "${tmpfile_latest}"; then
    mv "${tmpfile_latest}" "${documentroot}/latest-news.html"
    chmod 664 "${documentroot}/latest-news.html"
fi

# write out an RSS feed for HTTP
echo "Writing the RSS feed for HTTP"
tm-generate-feed.pl \
    -a \
    -d "${since}" \
    -n 15 \
    -x \
    > "${tmpfile_xhtml}"

if test -s "${tmpfile_xhtml}"; then
    mv "${tmpfile_xhtml}" "${documentroot}/feed.xml"
    chmod 664 "${documentroot}/feed.xml"
fi

# write out an Atom feed for Gemini
echo "Writing the Atom feed for Gemini"
tm-generate-feed.pl \
    -a \
    -d "${since}" \
    -n 15 \
    -g \
    > "${tmpfile_gemini}"

if test -s "${tmpfile_gemini}"; then
    mv "${tmpfile_gemini}" "${geminiroot}/feed.xml"
    chmod 664 "${geminiroot}/feed.xml"
fi

# fix up the Gemini index
echo "Writing the Gemini index"
tm-generate-gemtext-index.sh

# notify via MQTT
echo "Pinging via MQTT"
doas -u tuxmachines /home/tuxmachines/bin/tm-monitor-site-updates.sh

closure

exit 0
./rss-since-scraper.config
# https://lwn.net/headlines/newrss
https://9to5linux.com/feed/
# https://fossforce.com/feed/
https://www.internetsociety.org/feed/
http://feeds.feedburner.com/Ubuntubuzz
https://blog.torproject.org/rss.xml
https://linuxgizmos.com/feed/
./rss-since-scraper.pl
#!/usr/bin/perl -T
# 2021-05-16
# XML RSS and Atom feed web scraper,
# feed it URLs for feeds plus a date-time stamp
# entries will be parsed and can be saved in a file
# local times will be converted to UTC
use utf8;
use Getopt::Std;
use Time::ParseDate;
use Time::Piece;
use XML::Feed;
use URI;
use LWP::UserAgent;
use HTTP::Response::Encoding;
use HTML::TreeBuilder::XPath;
use HTML::Entities;
use English;
use strict;
use warnings;
# Main driver: read the options, work out the cut-off time, then process
# every feed URL left on the command line.

our $VERBOSE = 0;
$OUTPUT_AUTOFLUSH = 1;

# work-arounds for 'wide character' error from wrong UTF8
binmode(STDIN,  ":encoding(utf8)");
binmode(STDOUT, ":encoding(utf8)");

our %opt;
getopts('ad:ho:tuvL', \%opt);

my $script = $0;

if (defined($opt{'h'})) {
    usage($script);
}

if (defined($opt{'v'})) {
    $VERBOSE++;
}

# Output destination.  The capture in the pattern match also untaints the
# value, which is required because the script runs under -T.
my $output;
if (defined($opt{'o'})) {
    # XXX needs proper sanity checking for path and filename at least
    $output = $opt{'o'};
    $output =~ s/[\0-\x1f]//g;
    if ($output =~ /^([-\/\w\.]+)$/) {
        $output = $1;
    } else {
        die("Bad path or file name: '$output'\n");
    }
} else {
    $output = '/dev/stdout';
}

my $utc = 0;        # treat input as a local time and convert to UTC
if (defined($opt{'u'})) {
    $utc = 1;       # treat input as UTC without conversion
}

# Cut-off time in epoch seconds.
my $sdts;
if (defined($opt{'d'})) {
    $sdts = parsedate($opt{'d'}, GMT => $utc);
    # parsedate() returns undef for input it cannot understand; stop here
    # with a clear message instead of failing later inside strptime().
    defined($sdts)
        or die("Could not parse date or date-time: '$opt{d}'\n");
} else {
    $sdts = parsedate('yesterday');
}

print STDERR qq(S=$sdts\n)
    if ($VERBOSE);

my $t = Time::Piece->strptime($sdts, '%s');

print STDERR qq(D=), $t->strftime("%a, %d %b %Y %H:%M:%S %Z"), qq(\n)
    if ($VERBOSE);

my $count  = 0;     # feeds read successfully
my $errors = 0;     # feeds which could not be read

while (my $url = shift(@ARGV)) {
    next if ($url =~ /^\s*#/);          # skip comments

    print STDERR qq(\nU=$url\n)
        if ($VERBOSE);

    if (get_feed($t, $url, $output)) {
        $count++;
    } else {
        $errors++;
        print STDERR qq(Could not find feed at URL: "$url"\n);
    }
}

# nothing was attempted at all, so show the help text
usage($script) unless ($count || $errors);

exit(0);
# usage($script)
# Print the help text on stdout and exit with status 0.
#   $script - path of the running program; only its base name is shown.
# Does not return.
sub usage {
    my ($script) = @_;

    # drop any leading directories
    $script =~ s/^.*\///;

    print <<"EOH";
USAGE:
$script [-ahtuvL] [-o file] [-d date/date-time] feed-url [feed-url...]
-a appends the file specified by -o instead of the default of
overwriting it.
-d is the cut-off date-time stamp; feed entries published before
it will be ignored. Default is "yesterday" at the
current time. The format is yyyy-mm-dd or yyyy-mm-ddThh:mm
-o points to the file for collecting output, it is stdout by default.
-t precedes the entries from each feed with a heading which links
to the feed.
-u treats start date as UTC, default is to use the local time zone.
-v show debugging output on stderr.
-L suppress use of <li> elements but leave the others.
-h shows this message.
Multiple feed URLs can be specified.
Queries and fragments are trimmed from the URIs.
Broken or malformed feeds will be skipped completely.
EXAMPLES:
$script -u -d 2019-08-01T00:00 http://example.com/ https://example.org/
$script -o /tmp/foo.html http://example.com/
$script -a -o /tmp/foo.html -d 2019-08-01 https://example.com/
The date for the -d option can be made using command substitution
and the date(1) utility.
$script -d \$(date -d '2 days ago' +'%Y-%m-%d') https://example.com/
KNOWN BUGS:
As a work-around for UTF-8 in Chromium and Firefox, meta elements
declaring UTF-8 explicitly are peppered through the output. The
placement cannot really be helped and the result is not valid XHTML
because these are in the wrong part of the document.
And it goes without saying that scraping sites is very brittle and
can stop working with even minor changes to the page structure.
EOH

    exit(0);
}
# get_feed($t, $url, $output)
# Fetch and parse one feed, then write its scraped entries to $output.
#   $t      - cut-off time, handed on to read_entries()
#   $url    - address of the RSS or Atom feed
#   $output - path of the file which collects the output
# Returns 1 when the feed could be parsed (even if it produced no new
# entries) and 0 when the feed or its title could not be read.
# Dies when the output file cannot be opened.
sub get_feed {
    my ($t, $url, $output) = @_;

    my $feed;
    eval {
        $feed = XML::Feed->parse(URI->new($url));
    };
    if ($@) {
        print STDERR $@, qq(\n);
        print STDERR qq( Failed feed for '$url'\n);
        return(0);
    } elsif (!defined($feed)) {
        return(0);
    }

    my $feed_title;
    eval {
        $feed_title = $feed->title;
    };
    if ($@) {
        print STDERR $@, qq(\n);
        print STDERR qq( Failed title for '$url'\n);
        return(0);
    }
    $feed_title = '' unless (defined($feed_title));

    my $feed_format = encode_entities($feed->format);

    print STDERR qq(\tT=$feed_title\n)
        if ($VERBOSE);
    print STDERR qq(\tF=$feed_format\n)
        if ($VERBOSE);

    my @entries = read_entries($t, $feed, $output);

    if (@entries) {
        # Outputs this run has already written to.  Without -a only the
        # first write may truncate the file; reopening with '>' for every
        # feed would leave nothing but the last feed's entries in it.
        our %OUTPUT_STARTED;

        my $mode = (defined($opt{'a'}) || $OUTPUT_STARTED{$output})
            ? '>>'
            : '>';

        open(my $out, $mode, $output)
            or die("Could not open '$output' for writing: $!\n");
        $OUTPUT_STARTED{$output} = 1;

        # work-around for browser not recognizing UTF-8 automatically
        # print $out qq(<meta charset="utf-8" />\n);
        binmode($out, ":encoding(utf8)");

        # remember whether <div> was opened so that it is closed only then
        my $wrapped = 0;

        if (defined($opt{'t'})) {
            if (defined($opt{'L'})) {
                print $out qq(<div>\n);
                $wrapped = 1;
            }
            # The title and address come from outside, so escape the
            # characters which are special in markup before embedding them.
            my $href    = encode_entities($url, '<>&"');
            my $heading = encode_entities($feed_title, '<>&"');
            print $out qq(<h3><a href="$href">$heading</a></h3>\n);
        }

        print $out join("", @entries);

        if ($wrapped) {
            print $out qq(</div>\n);
        }

        # buffered write errors only surface on close
        close($out)
            or warn("Could not close '$output': $!\n");
    }

    return(1);
}
# read_entries($t, $feed, $output)
# Walk the entries of a parsed feed, fetch the page behind every entry
# published at or after the cut-off and turn it into markup using
# choose_parser().
#   $t      - cut-off: an object with an epoch() method (such as the
#             Time::Piece made by the caller) or a date string that
#             parsedate() understands
#   $feed   - feed object whose entries() are examined
#   $output - unused here, kept for the existing callers
# Returns the list of markup fragments, one per accepted entry.
sub read_entries {
    my ($t, $feed, $output) = @_;

    # Take the epoch seconds directly when possible.  Stringifying the
    # UTC-based Time::Piece and parsing the text again treats it as a
    # local time and moves the cut-off by the UTC offset of the host.
    my $since = UNIVERSAL::can($t, 'epoch') ? $t->epoch : parsedate($t);
    $since = 0 unless (defined($since));

    # Feed Burner path prefixes (without the leading slash) and the sites
    # which they stand for.
    my @feedburner_sites = (
        [ 'projectcensored',               'www.projectcensored.org' ],
        [ 'johnpilger',                    'johnpilger.com' ],
        [ 'cubexyz.blogspot.com',          'cubexyz.blogspot.com' ],
        [ 'LnuxTech-lb',                   'linuxtechlab.com' ],
        [ 'www.privateinternetaccess.com', 'www.privateinternetaccess.com' ],
        [ 'original.antiwar.com',          'original.antiwar.com' ],
        [ '~r/MichaelGeistsBlog',          'www.michaelgeist.ca' ],
        [ 'EliveLinuxWebsiteUpdates',      'www.elivecd.org' ],
        [ 'www.tecmint.com',               'www.tecmint.com' ],
    );

    my @entries = ();

    foreach my $entry ($feed->entries) {
        # print STDERR Dumper($entry),qq(\n\n)
        #     if($VERBOSE);

        # entry time
        my $ft = $entry->{entry}{pubDate}
            || $entry->issued
            || $entry->modified;

        # entry time in seconds, 0 when missing or not parsable
        my $et = defined($ft) ? (parsedate($ft) || 0) : 0;

        next unless ($et =~ /^\d+$/ && $et >= $since);

        # these links are sometimes redirections from proxies
        my ($base, $content) = fetch_page($entry->link);

        # fetch_page() signals failure with -1
        next if (!defined($base) || !defined($content));
        next if ($base eq -1 || $content eq -1);

        print STDERR qq(Fetched:), substr($base, 0, 30), qq(\n)
            if ($VERBOSE);

        my $uri = URI->new($base)
            or die("Bad address, '$base', could not form URI\n");
        $uri->query(undef);
        $uri->fragment(undef);

        my $site = $uri->authority;
        $site = '' unless (defined($site));

        # many sites are under feedburner
        if ($site eq 'feeds.feedburner.com') {
            if ($VERBOSE) {
                print STDERR qq(A=Feed Burner\n);
            }

            # The path of a URI starts with a slash, so remove it before
            # comparing against the prefixes.
            (my $path = $uri->path) =~ s{\A/}{};

            foreach my $pair (@feedburner_sites) {
                my ($prefix, $real_site) = @{$pair};
                if (index($path, $prefix) == 0) {
                    $site = $real_site;
                    last;
                }
            }
        }

        print STDERR qq(A=$site\n)
            if ($VERBOSE);

        # TODO: remove spammy, paid-for press releases
        # (www.commondreams.org)

        scan_for_scripts($site, $content);

        my $o = choose_parser($site, $uri->canonical, $content);

        if ($o) {
            push(@entries, $o);
        } else {
            # identify the feed which had the error
            print STDERR qq(\t), $feed->title, qq(\n);
        }

        print STDERR qq(\t\t), $base, qq(\n)
            if ($VERBOSE);
    }

    # no <hr /> separator is added after the entries, it is not needed
    # with <div>

    return(@entries);
}
# fetch_page($uri)
# Retrieve a page with an HTTP GET request.
#   $uri - address of the page
# Returns the list (base, content): the base address reported by the
# response and the decoded body.  Returns (-1, -1), after a warning,
# when the request fails or the body cannot be decoded.
sub fetch_page {
    my ($uri) = @_;

    my $ua = LWP::UserAgent->new;
    $ua->agent("NotRSS0day/0.1");

    my $request = HTTP::Request->new(GET => $uri);
    my $result  = $ua->request($request);

    unless ($result->is_success) {
        warn("Could not open '$uri' : ", $result->status_line, "\n");
        return(-1, -1);
    }

    # decoded_content() gives undef when the body cannot be decoded;
    # report that as a failure instead of handing undef to the caller
    my $content = $result->decoded_content;
    unless (defined($content)) {
        warn("Could not decode '$uri'\n");
        return(-1, -1);
    }

    return($result->base, $content);
}
# scan_for_scripts($site, $content)
# Parse $content as HTML and look for nodes matching the XPath
# expression 'script'.
#   $site    - name of the site, used only in the message
#   $content - markup of the fetched page
# Returns 1 when nothing matched.  On a match a message is written to
# stderr and the whole program exits with status 2.
# NOTE(review): 'script' is a relative path, so presumably only direct
# children of the context node can match -- confirm whether '//script'
# was intended before changing it, since most pages carry scripts.
sub scan_for_scripts {
    my ($site, $content) = @_;

    my $tree    = HTML::TreeBuilder::XPath->new_from_content($content);
    my @scripts = $tree->findnodes('script');

    if (@scripts) {
        print STDERR qq(script payload found in $site !\n);
        exit(2);
    }

    $tree->delete;

    return(1);
}
sub choose_parser {
my ($site, $url, $content) = (@_);
my ($xpath_title, $xpath_description) = (0,0);
my ($title, $description) = (0,0);
print STDERR qq(S=$site\n)
if ($VERBOSE);
if ($site eq '9to5linux.com') {
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.aclu.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div/div[@class="panel-pane pane-aclu-components-description description"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'anniemachon.ch') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'original.antiwar.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+\W\s+Antiwar.com Original//;
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'ar.al') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//body/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'archlinux.org') {
$xpath_title = '//h2[@itemprop="headline"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="article-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'blog.arduino.cc') {
$xpath_title = '//div[@class="post"]/h3[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'blog.benjojo.co.uk') {
$xpath_title = '//head/title';
$title = parse_title($xpath_title, $content);
$xpath_description = '//h1/following-sibling::p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.bunniestudios.com') {
$xpath_title = '//h2[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//h2/following-sibling::div[1]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'kb.cert.org') {
$xpath_title = '//div/div/div/div[@class="large-12 columns"]/h2';
$title = parse_title($xpath_title, $content);
$xpath_description = '//head/meta[@name="Description"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.commondreams.org') {
return(0) if ($url =~/\/newswire\//);
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
# $xpath_description = '//head/meta[@name="description"]';
$xpath_description = '//div[@class="views-article__body prose node__body"]/p[1]';
$description = parse_description($xpath_description, $content);
unless($description) {
$xpath_description = '//div[@class="headline__body prose node__body"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'www.counterpunch.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+-\s+CounterPunch.org//;
# $xpath_description = '//div[@class="story-header-area"]/p[1]';
$xpath_description = '//div[@class="story-header-area"]/p[position()<3 and not(contains(text(),"Subscribers content"))]';
$description = parse_description($xpath_description, $content);
$description = 0 if($description =~ /We don't shake our/);
unless($description) {
$xpath_description = '//div[@class="post_content"]/p[position()>1 and position()<4 and not(contains(text(),"Subscribers content"))]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'couragefound.org') {
$xpath_title = '//html/head/meta[@name="twitter:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'cpj.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
# .col-sm-7 > article:nth-child(1) > p:nth-child(3)
$xpath_description = '//div[@class="col-sm-7"]/p[1]';
$description = parse_description($xpath_description, $content);
$description =~ s/>[^>]*—/>/;
} elsif ($site eq 'climatenewsnetwork.net') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+\W\s+Climate News Network//;
$xpath_description = '//div[@class="entry-content-post"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.craigmurray.org.uk') {
$xpath_title = '//html/head/meta[@name="twitter:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//h1/following-sibling::p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'creativecommons.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+\W\s+Creative Commons//;
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
unless($description eq '<blockquote><p></p></blockquote>') {
$xpath_description = '//div[@class="entry-content"]/p[2]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'cubexyz.blogspot.com') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@id="mainClm"]/div[@class="blogPost"]';
$description = parse_description($xpath_description, $content);
$description =~ s/\s+//;
# $description =~ s/\s\s+.*<\/blockquote>/<\/blockquote>/m;
} elsif ($site eq 'danielmiessler.com') {
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
# remove podcasts
return(0) if ($title =~ m/Unsupervised Learning: No\./);
$xpath_description = '//div[@class="entry-content"]/p[position()>=last()-1]';
$description = parse_description($xpath_description, $content);
# remove adverts for social control media
my $de = HTML::TreeBuilder::XPath->new_from_content($description);
for my $p ($de->findnodes('//p')) {
if($p->as_text =~ m/^Discuss on Tw/) {
$p->delete;
}
}
$description = $de->as_XML_compact;
$de->delete();
$description =~ s/^.*(<blockquote>)/$1/;
$description =~ s/(<\/blockquote>).*$/$1/;
} elsif ($site eq 'dataswamp.org') {
$xpath_title = '//h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//h1/following-sibling::p[position()>1 and position()<4]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.democracynow.org') {
$xpath_title = '//h1[1]';
$title = parse_title($xpath_title, $content);
return(0) if ($title =~ m/recent shows/i);
return(0) if ($title =~ m/^headlines/i);
$xpath_description = '(//div[@class="headline_body"]/div[@class="headline_summary"]/p[1])[1]';
$description = parse_description($xpath_description, $content);
unless($description) {
$xpath_description = '(//div[@class="text"]/p[1])[1]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'www.digitalmusicnews.com') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$title = failed_utf($title);
$xpath_description = '//div[@id="main"]//h2';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.desmog.com') {
$xpath_title = '//div[@class="elementor-widget-container"]/h1';
$title = parse_title($xpath_title, $content);
# $xpath_description = '//div[@class="elementor-widget-container"]/div/p[position()<3]';
$xpath_description = '(//div[@class="elementor-widget-container"]/div/p)[position()<3]';
$description = parse_description($xpath_description, $content);
# xxx work-around to eliminate site signature :(
$description =~ s/<p>Website by.*//ms;
} elsif ($site eq 'www.desmogblog.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="field-items"]/div[1]/p[2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'thedissenter.org') {
$xpath_title = '//h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="content"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'dontextraditeassange.com') {
$xpath_title = '//div[@class="entry-categories"]/following-sibling::h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[position()>1 and position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.dw.com') {
$xpath_title = '//div[1]/h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[1]/h1[1]/following-sibling::p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.elivecd.org') {
$xpath_title = '//h1[@class="post-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="post-content"]/h5[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.theenergymix.com') {
# lll
$xpath_title = '//h1[@class="jeg_post_title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="content-inner"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.eff.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
# work-around for something broken with p[1]
$xpath_description = '//div[@class="field__items"]/div[1]/p[position()>1 and position()<=4]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.exposedbycmd.org') {
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[position()<=2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'fair.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
# .entry-content > p:nth-child(4)
$xpath_description = '//div[@class="entry-content"]/p[2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'femtejuli.se') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//html/head/meta[@property="og:description"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'ferd.ca') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//h2/following-sibling::p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'fortran-lang.org') {
$xpath_title = '//div[@class="newsletter col-wide"]/h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="newsletter col-wide"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'fossforce.com') {
$xpath_title = '//div//h1[@class="post-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="post-content"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.fossmint.com') {
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.france24.com') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="t-content t-content--article"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.gamingonlinux.com') {
$xpath_title = '//div/h1[@class="title p-name"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="content group e-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'godotengine.org') {
# lll
$xpath_title = '//div[@class="info"]/h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="info"]/following-sibling::p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'thegrayzone.com') {
$xpath_title = '//h1[@class="entry-title" and 1]';
unless($title) {
$xpath_title = '//h1[1]';
$title = parse_title($xpath_title, $content);
}
$xpath_description = '//div[@class="entry-content"]/h3[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.greenparty.org.uk') {
# LLL fix this above with $et, does not currently get this far
$xpath_title = '//div[@class="threequarters"]/h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="threequarters"]/h1[1]/following-sibling::p[3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'hackaday.com') {
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
# $xpath_description = '//html/head/meta[@property="og:description"]';
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.hrw.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
# $xpath_description = '//html/head/meta[@property="og:description"]';
$xpath_description = '//div[@class="article-body article-body--contained"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'infojustice.org') {
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="post-content entry-content"]/p[position()>1 and position()<4]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'insighthungary.444.hu' or $site eq '444.hu') {
$xpath_title = '//div[@id="headline"]/h1';
$title = parse_title($xpath_title, $content);
$xpath_description = '//p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.itwire.com') {
$xpath_title = '//h2[@class="itemTitle"]';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+Featured.*//; # should have been in XPath instead
$xpath_description = '//div[@class="itemIntroText"]/p';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'jacobinmag.com') {
$xpath_title = '//body/h1[@class="po-hr-cn__title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//h1/following-sibling::p[@class="po-hr-cn__dek"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'johnpilger.com') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$title = &title_case($title);
$xpath_description = '//div[@class="text book last full" and position()=1]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'krebsonsecurity.com') {
$xpath_title = '//div/h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'kubernetes.io') {
$xpath_title = '//div[@class="content"]/h1';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="td-content"]/p[position()>1 and position() < 5 and not(preceding-sibling::h2)]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.laquadrature.net') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content entry-content-single"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.lightbluetouchpaper.org') {
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[position()<=2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.linuxandubuntu.com') {
$xpath_title = '//div/h1[@class="alignwide wp-block-post-title"]';
$title = parse_title($xpath_title, $content);
#lll
$xpath_description = '//div[contains(@class, "entry-content")]/p[position() < 5 and not(preceding-sibling::h2)]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.linuxbuzz.com') {
$xpath_title = '//div[@class="inside-article"]/h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.linuxcloudvps.com') {
$xpath_title = '//div[@class="main"]/h2[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//p[position()>1 and position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'linuxhandbook.com') {
$xpath_title = '//div/h1[@class="hero__title text-center"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="content js-toc-content"]/p[1]';
$description = parse_description($xpath_description, $content);
# skip newsletters and such
if(!$description) {
return(0);
}
} elsif ($site eq 'www.linuxtechi.com') {
$xpath_title = '//div/h1[@class="title entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="nv-content-wrap entry-content"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'linuxgizmos.com') {
$xpath_title = '//div[@class="post"]/h2';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entrytext"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'linuxtechlab.com') {
$xpath_title = '//h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="text"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'lunduke.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//html/head/meta[@property="og:description"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'markcurtis.info') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+\W\s+Meduza//;
$xpath_description = '//div[@class="entry-content"]/p[position()>=3 and position()<=4]';
$description = parse_description($xpath_description, $content);
unless($description) {
# some do not have the extra byline
# but it is hard to parse which do:
$xpath_description = '//div[@class="entry-content"]/p[position()>=2 and position()<=3]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'meduza.io') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+\W\s+Meduza//;
$xpath_description = '//div[@class="GeneralMaterial-article"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.michaelgeist.ca') {
$xpath_title = '//h1[@class="title"]';
$title = parse_title($xpath_title, $content);
return(0) if($title=~/^The LawBytes Podcast/);
$xpath_description = '//div[@class="entry"]/p[last()]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.michaelwest.com.au') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+\W\s+Michael West.*//;
$xpath_description = '//head/meta[@property="og:description"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.mintpressnews.com') {
$xpath_title = '//head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'blog.mozilla.org') {
$xpath_title = '//div[1]/h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="ft-c-single-post__body"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.thenation.com') {
$xpath_title = '//div[@class="article-header-content"]/h1[@class="title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="article-body-inner"]/p[position()<3 and @class!="caption"]';
$description = parse_description($xpath_description, $content);
$description =~ s/[\d\s]*Ad Policy.*$//i;
} elsif ($site eq 'newmatilda.com') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+\W\s+New Matilda.*//;
$xpath_description = '//div/div[@class="post-content text-font description"]/p[2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'oceanservice.noaa.gov') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+\W\s+Michael West.*//;
$xpath_description = '//head/meta[@property="og:description"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'off-guardian.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//h6/following-sibling::p[@class="dropcap"]';
$description = parse_description($xpath_description, $content);
unless($description) {
$xpath_description = '//div[@class="transcript"]/p[1]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'papersplease.org') {
$xpath_title = '//h1[@class="post-title entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[position()<4]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'news.opensuse.org') {
$xpath_title = '//h1[@class="decorated-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="col-md-7 col-12 mx-auto text-justify"]/p[position() <3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'opensource.com') {
$xpath_title = '//h1[@class="published page-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@id="article_content"]//div[@class="clearfix text-formatted field field--name-body field--type-text-with-summary field--label-hidden field__item"]/p[not(preceding-sibling::h2) and position() < 5]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'opensourcesecurity.io') {
$xpath_title = '//h1[@class="entry-itle"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'ostechnix.com') {
$xpath_title = '//div/h1[@class="post-title single-post-title entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="inner-post-entry entry-content"]/div/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.pclinuxos.com') {
$xpath_title = '//div[@class="title"]/h2[1]';
$title = parse_title($xpath_title, $content);
$title =~ s/^\s+//;
$xpath_description = '//div[@class="entry"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'perens.com') {
# header.entry-header h1.entry-title
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
unless($description) {
$xpath_description = '//div[@class="entry-content"]/descendant::p[1]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'perlweeklychallenge.org') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="post-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.projectcensored.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="inner-post-entry entry-content"]/p[2]';
$description = parse_description($xpath_description, $content);
if(!$description || $description =~ /Listen to all of our previous/) {
$xpath_description = '//div[@id="penci-post-entry-inner"]/div/div/div[1]';
$description = parse_description($xpath_description, $content);
}
if(!$description || $description =~ /Listen to all of our previous/) {
$xpath_description = '//div[@id="penci-post-entry-inner"]/p[1]';
$description = parse_description($xpath_description, $content);
}
if(!$description || $description =~ /Listen to all of our previous/) {
$xpath_description = '//div[@id="penci-post-entry-inner"]/div/div[1]';
$description = parse_description($xpath_description, $content);
}
if(!$description || $description =~ /Listen to all of our previous/) {
$xpath_description = '//div[@id="penci-post-entry-inner"]/div[1]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'pluralistic.net') {
1;
# placeholder
} elsif ($site eq 'www.privateinternetaccess.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="detail-ct"]/p[2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'projects.propublica.org') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$title =~ s/\s*\|.*$//;
$xpath_description = '//html/head/meta[@property="og:description"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'features.propublica.org') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$xpath_description = '//html/head/meta[@property="og:description"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.propublica.org') {
# $xpath_title = '//html/head/meta[@name="dcterms.Title"]';
$xpath_title = '//html/head/meta[@property="headline"]';
$title = parse_title($xpath_title, $content);
unless($title) {
$xpath_title = '//h2[@class="hed"]';
$title = parse_title($xpath_title, $content);
}
$xpath_description = '//div[@class="article-body"]/p[position()<=2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.openrightsgroup.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="sixteen columns"]/*/p[1]';
$description = parse_description($xpath_description, $content);
unless ($description) {
$xpath_description = '//div[@class="sixteen columns"]/p[1]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'puri.sm') {
$xpath_title = '//div[@class="container"]/h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="blog-entry e-content"]/p[not(preceding-sibling::h1) and position() < 4]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.qt.io') {
$xpath_title = '//div[@class="h-wysiwyg-html/h1"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//span[@id="hs_cos_wrapper_post_body"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'rakudoweekly.blog') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '(//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.raspberrypi.org') {
$xpath_title = '//h1[2]';
$title = parse_title($xpath_title, $content);
unless ($title) {
$xpath_title = '//h1[1]';
$title = parse_title($xpath_title, $content);
}
$xpath_description = '(//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
unless($description) {
$xpath_description = '//div[contains(@class,"c-post-content")]/p[1]';
$description = parse_title($xpath_description, $content);
}
} elsif ($site eq 'www.redhat.com') {
$xpath_title = '//div[@class="rh-article-teaser--component"]/h1';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[starts-with(@class,"rh-generic")]//p[not(preceding-sibling::h3) and position() < 3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'respectfulinsolence.com') {
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '(//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'therevelator.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+.bull\;.*//;
$title =~ s/\s+•.*//;
$xpath_description = '(//div[@id="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.rferl.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '(//div[@id="article-content"]/div[1]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'robertreich.org') {
$xpath_title = '//div[@class="caption"]/h2/b';
$title = parse_title($xpath_title, $content);
if (!$title) {
$xpath_title = '//li[@class="post"]/a/h2';
$title = parse_title($xpath_title, $content);
$xpath_description = '(//div[@class="caption"])/p[2]';
$description = parse_description($xpath_description, $content);
} else {
$xpath_description = '(//div[@class="caption"])[last()]/p[last()]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'robert.ocallahan.org') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '(//div[@class="post-body entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.rosehosting.com') {
$xpath_title = '//div/h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '(//div[@class="entry-content"]/p[not(preceding-sibling::h3) and position() < 3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'shadowproof.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
# div.vw-post-content.clearfix p
$xpath_description = '//div[@class="vw-post-content clearfix"]/p[position()<=2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'scheerpost.com') {
$xpath_title = '//h1[contains(@class,"entry-title")]';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+-\s+CounterPunch.org//;
$xpath_description = '//head/meta[@property="og:description"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.spiegel.de') {
$xpath_title = '//h2[@class="article-title lp-article-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div/h2/following-sibling::p[@class="article-intro"]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'digit.site36.net') {
$xpath_title = '//h1[1]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'blog.steve.fi') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[position()>=last()-1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'techcrunch.com') {
$xpath_title = '//html/head/meta[@name="sailthru.title"]';
$title = parse_title($xpath_title, $content);
$title = failed_utf($title);
$xpath_description = '//html/head/meta[@name="sailthru.description"]';
$description = parse_description($xpath_description, $content);
$description = failed_utf($description);
$url =~ s/\?[^\?]*$//;
} elsif ($site eq 'www.techdirt.com') {
$xpath_title = '//h1[@class="posttitle"]';
$title = parse_title($xpath_title, $content);
# remove Daily Deals
return (0) if ($title =~ m/^Daily Deal/);
# remove Funniest
return (0) if ($title =~ m/^Funniest/i);
# skip recaps
return(0) if ($title =~ m/^This Week In Techdirt History/i);
$xpath_description = '//div[@class="byline"]/following-sibling::p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.tecmint.com') {
$xpath_title = '//h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
# lll
} elsif ($site eq 'www.technologyreview.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[1]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.tedunangst.com') {
# http://www.tedunangst.com/flak/rss
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="byline"]/following-sibling::p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'threatpost.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="c-article__intro"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'telex.hu') {
$xpath_title = '//div[1]/div[1]/h1';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="top-section"]/following-sibling::p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'blog.torproject.org') {
$xpath_title = '//h1[@class="title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="body"]/p[position()<3]';
$description = parse_description($xpath_description, $content);
unless($description) {
$xpath_description = '(//p)[2]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'torrentfreak.com') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$title =~ s/\s[\-\*]\sTorrentFreak$//;
return (0) if ($title =~ /Most Torrented Movie of The Week/i);
# '//div[@class="entry-summary"]/p[@class="entry-lead"]'
$xpath_description = '//p[@class="article__excerpt"]';
$description = parse_description($xpath_description, $content);
$url =~ s/\?.*$//;
} elsif ($site eq 'blog.trailofbits.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.truthdig.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
unless($title) {
$xpath_title = '//html/head/meta[@name="twitter:title"]';
$title = parse_title($xpath_title, $content);
}
$xpath_description = '//div[@class="article-item__content am2-content"]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'truthout.org') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@id="article-content"]/p[1]';
$description = parse_description($xpath_description, $content);
unless($description) {
$xpath_description = '//p[@data-pp-id="1.0"]';
$description = parse_description($xpath_description, $content);
}
# LLL - truthout's XHTML has multiple fatal validation errors
# cannot be processed, yet
} elsif ($site eq 'ubuntu.com') {
$xpath_title = '//html/head/title';
$title = parse_title($xpath_title, $content);
$title =~ s/\s+\|.*$//;
$xpath_description = '//div[@class="p-post__content"]//p[not(preceding-sibling::h2) and position() < 3]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.ubuntubuzz.com') {
$xpath_title = '//div[@class="title"]/h1';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry"]/p[2]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.unixmen.com') {
$xpath_title = '//div/h1[@class="entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="td-post-content"]//p[not(preceding-sibling::h2) and position() < 4]';
$description = parse_description($xpath_description, $content);
unless($description) {
$xpath_description = '//div[@class="td-post-content"]/p[position()>2 and position()<5]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'vitux.com') {
$xpath_title = '//div[@class="post-title-wrapper"]/h1';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="entry-content clearfix"]/p[not(preceding-sibling::h2) and position() < 3]';
$description = parse_description($xpath_description, $content);
unless($description) {
$xpath_description = '//div[@class="entry-content clearfix"]/p[1]';
$description = parse_description($xpath_description, $content);
}
unless($description) {
$xpath_description = '//div[@class="entry-content clearfix"]/p[2]';
$description = parse_description($xpath_description, $content);
}
} elsif ($site eq 'yottadb.com') {
$xpath_title = '//html/head/meta[@property="og:title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div/div[@class="col-sm-20" and position()=3]/p[1]';
$description = parse_description($xpath_description, $content);
} elsif ($site eq 'www.zenwalk.org') {
$xpath_title = '//h3[@class="post-title entry-title"]';
$title = parse_title($xpath_title, $content);
$xpath_description = '//div[@class="post-body entry-content"]';
$description = parse_description($xpath_description, $content);
} else {
# the site does not yet have XPaths, return with an error
print STDERR qq(Site "$site" is not yet configured,);
print STDERR qq(\tSee "$url"\n);
return(0);
}
# LLL - should print warning if no title or description is found
if ( $description !~ /<p>/ ) {
$description = "<blockquote><p>$description</p></blockquote>";
}
return( &print_item($title, $url, $description) );
}
sub parse_title {
    # Extract a title from $content using the XPath expression $xpath_title.
    #
    # For <meta> nodes the "content" attribute is used, for every other node
    # its text.  When several nodes match, the last one wins.  The result is
    # whitespace-trimmed and entity-encoded; a false value (the encoded form
    # of 0) is returned when nothing usable was found.
    my ($xpath_title, $content) = (@_);

    my $tree  = HTML::TreeBuilder::XPath->new_from_content($content);
    my $title = 0;
    foreach my $node ($tree->findnodes($xpath_title)) {
        my $found = ($node->tag eq 'meta')
            ? $node->attr('content')
            : $node->as_text;
        $title = $found || 0;
    }
    # the tree is released explicitly once the nodes have been read
    $tree->delete;

    for ($title) {
        s/\s+$//m;
        s/^\s+//mg;
    }
    my $encoded = encode_entities($title);
    return($encoded);
}
sub parse_description {
    # Extract a description from $content using the XPath expression
    # $xpath_description and return it as an entity-encoded HTML fragment
    # wrapped in <blockquote>, or '' when nothing was found.
    #
    #   <meta> nodes : the "content" attribute, wrapped in <p>
    #   <p> nodes    : the trimmed text, wrapped in <p>; empty ones are skipped
    #   other nodes  : the trimmed text followed by a newline
    #
    # When several nodes match, the last usable one wins (each match
    # overwrites the previous one).
    my ($xpath_description, $content) = (@_);
    my $description = '';
    my $ent = HTML::TreeBuilder::XPath->new_from_content($content);
    # print STDERR $content,qq(====)x10,qq(\n);
    # print STDERR Dumper($ent),qq(\n);
    for my $d ($ent->findnodes($xpath_description)) {
        my $tag = $d->tag;
        if ($tag eq 'meta') {
            $description = '<p>' . encode_entities($d->attr('content')) . "</p>\n";
        } elsif ($tag eq 'p') {
            my $text = $d->as_trimmed_text;
            if ($text) {
                $description = '<p>' . encode_entities($text) . "</p>\n";
            }
        } else {
            # plain assignment: appending the text to itself here used to
            # emit every such description twice
            $description = encode_entities($d->as_trimmed_text) . qq(\n);
        }
    }
    $ent->delete;
    if ($description) {
        $description =~ s/>\s+/>/gm;
        $description = qq(<blockquote>$description</blockquote>\n);
    }
    # delete hidden soft-hyphen and zero-width space trackers; by now
    # encode_entities() has turned U+00AD and U+200B into entities, so it is
    # the entity forms that have to be removed
    $description =~ s/&shy;|&#x200B;//g;
    return($description);
}
sub failed_utf {
    # Crude clean-up for text whose UTF-8 / Unicode handling went wrong:
    # every WORD JOINER (U+2060) is dropped, the rest is returned untouched.
    my $text = shift;
    # $text =~ s/â/'/g;
    $text =~ s{\x{2060}}{}g;
    return($text);
}
sub print_item {
    # Build the XHTML fragment for one feed item: a linked <h5> heading
    # followed by the description, or by an empty <blockquote> when there is
    # none.  The fragment is wrapped in <li>...</li> unless option -L is set.
    my ($title, $url, $description) = (@_);
    my $as_list_item = !defined($opt{'L'});

    my @parts;
    push(@parts, qq(<li>)) if ($as_list_item);
    push(@parts, qq(<h5><a href="$url">$title</a></h5>\n));
    push(@parts, $description ? qq($description)
                              : qq(<blockquote>\n</blockquote>\n));
    push(@parts, qq(</li>\n\n)) if ($as_list_item);

    my $output = join('', @parts);
    return($output);
}
sub title_case {
    # Convert $title to title case and return it.
    #
    # Words on a small stop list (articles, conjunctions, prepositions) are
    # lower-cased, every other word gets an initial capital.  The first and
    # last word, the first and last word inside parentheses and the first
    # word after the first colon or semicolon are always capitalised.
    my ($title) = (@_);
    # based on Chapter 1.14.2, Perl Cookbook, 2nd ed.
    our %nocap;
    unless(keys %nocap) {
        foreach my $w (qw(a an the and but or as at by for
                          from in into of off on onto per to with)) {
            $nocap{$w}++;
        }
    }
    # put into lowercase if on stop list, else titlecase; the lookup is
    # done on the lower-cased word so that "OF" and "The" are recognised too
    $title =~ s/(\pL[\pL']*)/$nocap{lc($1)} ? lc($1) : ucfirst(lc($1))/ge;
    # first word guaranteed to cap
    $title =~ s/^(\pL[\pL']*) /\u\L$1/x;
    # last word guaranteed to cap
    $title =~ s/ (\pL[\pL']*)$/\u\L$1/x;
    # treat parenthesized portion as a complete title
    $title =~ s/\( (\pL[\pL']*) /(\u\L$1/x;
    $title =~ s/(\pL[\pL']*) \) /\u\L$1)/x;
    # capitalize first word following colon or semi-colon
    $title =~ s/ ( [:;] \s+ ) (\pL[\pL']* ) /$1\u\L$2/x;
    return ($title);
}
./rss-since-scraper.sh
#!/bin/sh
# 2022-07-07
PATH=/usr/local/bin:/usr/bin:/bin
closure() {
    # Remove the temporary directory named by ${tmpdir} together with the
    # feed-tmp.* files in it.  Exits with status 1 when no such directory
    # exists.
    #
    # ${tmpdir} must be quoted: unquoted and unset it vanishes from the
    # command line, leaving the one-argument form `test -d`, which is always
    # true, so rm and rmdir would then run without a directory.
    test -d "${tmpdir}" || exit 1
    echo "Erasing temporary directory (${tmpdir}) and its files."
    rm -f "${tmpdir}"/feed-tmp.*
    rmdir "${tmpdir}"
}
cancel() {
    # signal handler: report the interruption, erase the temporary files
    # and stop with exit status 2
    printf '%s\n' "Cancelled."
    closure
    exit 2
}
# trap various signals to be able to erase temporary files
trap "cancel" 1 2 15

# date passed to the scraper with -d, in YYYY-MM-DD form
start=$(date -d '-2 days' +'%F')
file="/var/www/tuxmachines.org/htdocs/feeds.html"

# the published file is created group-writable
umask 0002
echo '<div class="feedlist">' > "$file"
# printf instead of `echo -e`: this script runs under /bin/sh and not every
# sh understands -e (some print it literally); the output is the heading
# followed by three newlines
printf '<h2>Other Sites</h2>\n\n\n' >> "$file"

# set up a temporary directory for many temporary files
umask 0077
tmpdir=$(mktemp -d /tmp/feeds-tmp.XXXXXX)

# fetch feeds concurrently, each to a unique temporary file
# NOTE(review): ${feed} is deliberately left unquoted, as in the original,
# in case a configuration line carries more than one word -- confirm
while read feed; do
    tmpfile=$(mktemp -p "${tmpdir}" feed-tmp.XXXXXXX)
    # use -o option because of permission problems with stdout and su
    rss-since-scraper.pl -L -t -d "$start" -o "${tmpfile}" ${feed} &
done <<EOF
$(grep -E -v '^#|^$' /usr/local/bin/rss-since-scraper.config)
EOF

# block until every background scraper has finished
wait

# concatenate all the temporary feed files into the destination file
cat "${tmpdir}"/feed-tmp.* >> "$file"
echo '</div>' >> "$file"
chmod u=rw,g=rw,o=r "$file"

# clear signal trapping
trap - 1 2 15

# remove temporary files
closure
exit 0
./tm-add-entry-sql.pl
#!/usr/bin/perl
# 2022-07-08
use utf8;
use Getopt::Std;
use File::Temp qw(tempfile);
use File::Path qw(make_path);
use Unicode::Normalize qw(NFKD);
use HTML::TreeBuilder::XPath;
use HTML::FormatText;
use DBI qw(:sql_types);
use Term::ANSIColor;
use Capture::Tiny qw(capture capture_stdout);
use English;
use strict;
use warnings;
use lib "/usr/local/lib/perl5/";
use TuxMachines::ReadOn qw(ReadOn);
use open qw(:std :encoding(UTF-8));
# https://www.ietf.org/rfc/rfc2731.txt
# Main flow of the script: collect the metadata for a new entry (from the
# command line and/or interactively), ask for confirmation, then store the
# keys, the metadata and the edited body in the SQLite database.  Everything
# is written inside one transaction which is committed at the very end.
our $VERBOSE = 0;
$OUTPUT_AUTOFLUSH=1;
our %opt;
# -a, -d, -m, -s and -t take a value; -h and -v are plain flags
getopts('a:d:m:s:t:hv', \%opt);
my $script = $0;
if (defined($opt{'h'})) {
&usage($script);
}
if (defined($opt{'v'})) {
$VERBOSE++;
}
my $author = &get_author($opt{'a'}); # get option or default to blank
my $date = &get_date( $opt{'d'}); # get option or default to current date
my $title = &get_title( $opt{'t'}); # get option or default to blank
my $desc = &get_desc( $opt{'m'}); # get option
# the slug comes from -s or, failing that, is derived from the -t title
my $slug = &get_slug( $opt{'s'} || 0, $title || 0);
my $dir = '';
my $dest = '';
my $done = 0; # set once the user has confirmed and the record is stored
my $checked = 0; # set once a slug has been derived from the title
my $dbfile = "/var/www/tuxmachines.org/db/tm-static-site-generator.sqlite3";
# AutoCommit is off: nothing is stored until the commit after the loop
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
{ AutoCommit => 0, RaiseError => 1 })
or die("Could not open database '$dbfile': $!\n");
# done() is defined outside the part of the file visible here
$SIG{INT} = sub { &done($dbh) }; # quit gracefully
$SIG{HUP} = sub { &done($dbh) };
$SIG{TERM} = sub { &done($dbh) };
# NOTE(review): SIGSTOP cannot be caught, so this handler presumably never runs
$SIG{STOP} = sub { &done($dbh) };
# repeat the whole dialogue until a record has been confirmed and written
while (!$done) {
print qq(\nMetadata:\n);
# the date is only prompted for when -d was not given
if (!$opt{'d'}) {
$date = &read_date($date);
}
# turn a leading YYYYMMDD into YYYY/MM/DD; $dir is not used again in the
# part of the file visible here
$dir = $date;
$dir =~ s|^([0-9]{4})([0-9]{2})([0-9]{2})|$1/$2/$3|;
$author = &read_author($author);
$title = &read_title($title);
$desc = &read_description($desc);
# derive a slug from the title when there is none yet; $checked prevents a
# second derivation on later passes unless it is reset below
if (! $checked++ && !$slug && $title) {
# lll
$slug = $title;
$slug =~ s/\s+$//;
$slug =~ s/^\s+//;
$slug =~ s/\s+/_/g;
$slug =~ s/[[:punct:]]+/_/g; # harmonize with gemini
$slug =~ s/_+$//g;
$slug =~ s|/||g;
# NOTE(review): inside this class "+-:" is a range from "+" to ":", not
# three separate characters -- confirm that this is intended
$slug =~ s/[^\w+-:'"?!]+//g;
# swap out diactricicals, gemini clients choke on them
$slug = NFKD($slug);
$slug =~ s/\p{NonspacingMark}//g;
# anything beyond 63 characters is cut off; resetting $checked allows the
# derivation to run again on a later pass
if ($slug ne substr($slug,0,63)) {
print color('bold white');
print STDERR qq(Slug is too long. );
print STDERR qq(It should be less than 63 characters.\n);
print color('reset');
$checked = 0;
$slug = substr($slug,0,63);
} elsif (!$slug) {
print color('bold white');
print STDERR qq(Invalid title-based slug, );
print STDERR qq(check title or add slug\n);
print color('reset');
exit(1);
}
}
# let the user accept or replace the slug
$slug = &read_slug($slug);
print "A=$author\n" if ($VERBOSE);
print "D=$date\n" if ($VERBOSE);
print "T=$title\n" if ($VERBOSE);
print "M=$desc\n" if ($VERBOSE);
print "S=$slug\n" if ($VERBOSE);
# anything but "y" or "Y" restarts the dialogue with the current values as
# defaults
print qq(\n[y/N] );
my $i = <STDIN>;
chomp $i;
if ($i eq 'y' or $i eq 'Y') {
$done++;
} else {
next;
}
print "Waiting for database to unlock ...";
my ($recno, $ballast) = &get_next_available_recno($dbh, $date, $slug);
print "lock acquired\n";
# without a record number the dialogue starts over
if (!$recno) {
$done = 0;
$checked = 0;
next;
}
my $status;
if (!$slug) {
die("Slug missing"); # kludge for debugging
}
# each write step only runs when the previous one returned a true status
$status = &write_keys($dbh, $recno, $date, $slug, $ballast);
if($status) {
$status = &write_metadata($dbh, $recno, $title, $author, $date, $desc);
}
# lll
if ($status) {
my ($body, $rawtext) = &edit_body();
# the title is put in front of the raw text before it is stored
$rawtext = $title.' '.$rawtext;
$status = &write_body($dbh, $recno, $body, $rawtext);
}
if ($status == 1) {
print qq(Record added\n);
$done++;
} else {
$done = 0;
}
# a failed step undoes the partial writes and the dialogue starts over
if (!$done) {
print "Rolling back\n";
$dbh->rollback;
}
}
# write the changes
$dbh->commit;
$dbh->disconnect;
exit(0);
sub usage {
    # Print the command line help for $script on standard output and exit
    # with status 0.  The option list mirrors the getopts() specification
    # 'a:d:m:s:t:hv'.
    my ($script) = (@_);
    print "USAGE\n\n";
    print "$script [hv] [-a author] [-d date] [-m description] [-s slug] [-t title]\n\n";
    print " -a author aka dc.creator\n";
    print " -d date in YYYYMMDD format\n";
    print " -m is the brief description for search engines to use\n";
    print " -s the unique part of the file name\n";
    print " -t the title to be used in the HTML document\n";
    print " -v show debugging info\n";
    print "\n";
    print " -h show this message\n";
    print "\n";
    print "All options are optional. ";
    print "Missing values will be prompted for.\n";
    exit(0);
}
sub get_author {
    # Return the author given on the command line unchanged (it may be
    # undefined when the option is absent).
    my $author = shift;
    # lll - validation / lookup table?
    return($author);
}
sub get_date {
    # Return the date for the new entry.
    #
    # When a date was given it is taken from option -d and passed through
    # iso_8601_date(); a false result ends the script with status 1.
    # Otherwise the current UTC time is returned as YYYY-MM-DDTHH:MM.
    my ($date) = (@_);
    if ($date) {
        $date = &iso_8601_date($opt{'d'});
        unless ($date) {
            print color('bold white');
            print STDERR qq(Invalid date '), $opt{'d'}, qq('\n);
            print color('reset');
            exit(1);
        }
    }
    unless ($date) {
        # elements 1..5 of gmtime() are minute, hour, day, month, year
        my ($minute, $hour, $day, $month, $year) = (gmtime())[1 .. 5];
        $date = sprintf('%04d-%02d-%02dT%02d:%02d',
                        $year + 1900, $month + 1, $day, $hour, $minute);
    }
    print qq(Date = $date\n) if $VERBOSE;
    return($date);
}
sub get_title {
    # Return the title with leading and trailing whitespace removed; false
    # values (undef, '', 0) are passed through unchanged.
    my $title = shift;
    return($title) unless ($title);
    for ($title) {
        s/^\s+//;
        s/\s+$//;
    }
    return($title);
}
sub get_desc {
    # Return the description with leading and trailing whitespace removed;
    # false values (undef, '', 0) are passed through unchanged.
    my $description = shift;
    return($description) unless ($description);
    for ($description) {
        s/^\s+//;
        s/\s+$//;
    }
    return($description);
}
sub get_slug {
    # Return a normalised slug, taken from $slug when that is true and
    # otherwise derived from $title.  When both are false, $slug is returned
    # as it came in.
    #
    # Normalisation, identical for both sources:
    #   - surrounding whitespace is removed, inner whitespace becomes "_"
    #   - runs of punctuation become a single "_", a trailing "_" is dropped
    #   - characters outside the permitted set are removed
    #   - diacritics are removed (NFKD, then the combining marks are dropped)
    #   - the result is cut to 63 characters
    #
    # The script ends with status 1 when nothing is left after normalisation.
    my ($slug, $title) = (@_);
    print qq(1: $slug / $title\n) if ($VERBOSE);
    # the \w does not handle unicode properly, no clue why
    # NOTE(review): presumably because option values arrive from @ARGV as
    # undecoded bytes -- confirm
    my $from_title = (!$slug && $title) ? 1 : 0;
    my $raw = $from_title ? $title : $slug;
    if ($raw) {
        print "SLUG=$raw\n" if ($VERBOSE && $from_title);
        $slug = $raw;
        $slug =~ s/\s+$//;
        $slug =~ s/^\s+//;
        $slug =~ s/\s+/_/g;
        $slug =~ s/[[:punct:]]+/_/g; # harmonize with gemini
        $slug =~ s/_+$//;
        while ($slug =~ s/__+/_/g) { 1 }
        $slug =~ s/[^\w\+\-\:\[\]\{\}\\?\!\@\#\&\*\$\%]+//g;
        # swap out diactricicals, gemini clients choke on them
        $slug = NFKD($slug);
        $slug =~ s/\p{NonspacingMark}//g;
        $slug = substr($slug,0,63);
        if (!$slug) {
            print color('bold white');
            if ($from_title) {
                print STDERR qq(Invalid title-based slug, );
                print STDERR qq(check title or add slug\n);
            } else {
                # report what was given, not the emptied result
                print STDERR qq(Invalid slug '$raw'\n);
            }
            print color('reset');
            exit(1);
        }
    }
    print qq(2: $slug / $title\n) if ($VERBOSE);
    return($slug);
}
sub read_author {
    # Prompt for the author and return it.
    #
    # $author is offered as the default.  When option -a was given nothing
    # is read and the default is used as it is.  Otherwise one line is read
    # from standard input; surrounding whitespace is removed and an empty
    # answer keeps the default.  The prompt is repeated until there is an
    # author.  Dies when standard input ends while there is no default.
    my ($author) = (@_);
    my $done = 0;
    while (!$done) {
        print " Author: ";
        if ($author) {
            print "[$author] ";
            if($opt{'a'}) {
                print "\n";
            }
        }
        my $new_author = '';
        if (!$opt{'a'}) {
            $new_author = <STDIN>;
            if (!defined($new_author)) {
                # end of input: keep the default, or give up rather than
                # prompting forever
                last if ($author);
                die("No author given before end of input\n");
            }
            chomp($new_author);
            # a blank answer must not replace the default
            $new_author =~ s/^\s+//;
            $new_author =~ s/\s+$//;
        }
        if($new_author) {
            $author = $new_author;
        }
        # lll - lookup table or validation ?
        if ($author) {
            $done++;
        } else {
            print color('bold white');
            print STDERR qq(Add author name or handle\n);
            print color('reset');
        }
    }
    return($author);
}
sub read_date {
    # Prompt for the date and return it.
    #
    # $date is offered as the default.  An answer must look like YYYY-MM-DD
    # or YYYYMMDD and is returned without dashes; an invalid answer is
    # reported and also discards the default.  An empty answer keeps the
    # default; without a default the current UTC time (YYYY-MM-DDTHH:MM)
    # becomes the default and the prompt is shown again.
    my ($date) = (@_);
    my $done = 0;
    until ($done) {
        print qq( Date: );
        print qq([$date] ) if ($date);
        my $d = <STDIN>;
        chomp($d);
        if ($d) {
            my @found = ($d =~ m/^([0-9]{4}-[0-9]{2}-[0-9]{2})$/);
            @found = ($d =~ m/^([0-9]{4}[0-9]{2}[0-9]{2})$/) unless (@found);
            ($date) = @found;
            if ($date) {
                $date =~ s/-//g;
                $done++;
            } else {
                print color('bold white');
                print STDERR qq(Invalid date '), $d, qq('\n);
                print color('reset');
            }
        } elsif ($date) {
            $done++;
        } else {
            # elements 1..5 of gmtime() are minute, hour, day, month, year
            my ($minute, $hour, $day, $month, $year) = (gmtime())[1 .. 5];
            $date = sprintf('%04d-%02d-%02dT%02d:%02d',
                            $year + 1900, $month + 1, $day, $hour, $minute);
        }
    }
    return($date);
}
sub read_title {
    # Prompt for the title and return it.
    #
    # $title is offered as the default.  One line is read from standard
    # input and surrounding whitespace is removed; an empty answer keeps the
    # default.  The prompt is repeated until there is a title.  Dies when
    # standard input ends while there is no default.
    my ($title) = (@_);
    my $done = 0;
    while (!$done) {
        print qq( Title: );
        if ($title) {
            print qq([$title] );
        }
        my $t = <STDIN>;
        if (!defined($t)) {
            # end of input: keep the default, or give up rather than
            # prompting forever
            last if ($title);
            die("No title given before end of input\n");
        }
        chomp $t;
        # trim before testing, so that a blank answer is not taken as a
        # (then empty) title
        $t =~ s/^\s+//;
        $t =~ s/\s+$//;
        if ($t) {
            $title = $t;
            $done++;
        } elsif ($title) {
            $done++;
        } else {
            print color('bold white');
            print STDERR qq(Invalid title '$t'\n);
            print color('reset');
        }
    }
    return($title);
}
sub read_description {
    # Prompt for the brief description and return it.
    #
    # $description is offered as the default.  One line is read from
    # standard input, encoded as UTF-8 and stripped of surrounding
    # whitespace; an empty answer keeps the default.  The prompt is repeated
    # until there is a description.  Dies when standard input ends while
    # there is no default.
    my ($description) = (@_);
    my $done = 0;
    while (!$done) {
        print qq( Description: );
        if ($description) {
            print qq([$description] );
        }
        my $d = <STDIN>;
        if (!defined($d)) {
            # end of input: keep the default, or give up rather than
            # prompting forever
            last if ($description);
            die("No description given before end of input\n");
        }
        # NOTE(review): the other read_* functions keep their input as it
        # was read; confirm that the description really has to be encoded
        $d = Encode::encode( 'UTF-8', $d);
        chomp $d;
        # trim before testing, so that a blank answer is not taken as a
        # (then empty) description
        $d =~ s/^\s+//;
        $d =~ s/\s+$//;
        if ($d) {
            $description = $d;
            $done++;
        } elsif ($description) {
            $done++;
        } else {
            print color('bold white');
            print STDERR qq(Invalid description '$d'\n);
            print color('reset');
        }
    }
    return($description);
}
sub read_slug {
    # Prompt for the slug and return it.
    #
    # $slug is offered as the default.  One line is read from standard
    # input and normalised: surrounding whitespace is removed, inner
    # whitespace and runs of punctuation become a single "_", a trailing "_"
    # is dropped and characters outside the permitted set are removed.  An
    # empty answer keeps the default; an answer of which nothing is left is
    # reported and asked for again.  Dies when standard input ends while
    # there is no default.
    my ($slug) = (@_);
    chomp($slug);
    $slug =~ s/^\s+//;
    my $done = 0;
    while (!$done) {
        print qq( Slug: );
        if ($slug) {
            print qq([$slug] );
        }
        my $s = <STDIN>;
        if (!defined($s)) {
            # end of input: keep the default, or give up rather than
            # prompting forever
            last if ($slug);
            die("No slug given before end of input\n");
        }
        chomp $s;
        $s =~ s/^\s+//;
        $s =~ s/\s+$//;
        if ($s) {
            my $typed = $s;
            $s =~ s/\s+/_/g;
            $s =~ s|/+|_|g;
            # these two steps belong to the answer just typed, not to the
            # default it is about to replace
            $s =~ s/[[:punct:]]+/_/g; # harmonize with gemini
            $s =~ s/_+$//;
            while ($s =~ s/__+/_/g) { 1 }
            $s =~ s/[^\w\+\-\:\[\]\{\}\?\!\@\#\&\*\$\%]+//g;
            if ($s) {
                $slug = $s;
                $done++;
            } else {
                # nothing usable is left, ask again
                print color('bold white');
                print STDERR qq(Invalid slug '$typed'\n);
                print color('reset');
            }
        } elsif ($slug) {
            $done++;
        } else {
            print color('bold white');
            print STDERR qq(Invalid slug '$slug'\n);
            print color('reset');
        }
    }
    return($slug);
}
# Let the user write the post body in nano, run it through tidy, and keep
# re-opening the editor until the result validates and its images pass
# the hotlink and ALT checks.
#
# Parameters:
#   $tmppath - accepted but not used anywhere in this routine.
# Returns: ($body, $rawtext)
#   $body    - the <body> element serialised with as_HTML and then passed
#              through ReadOn(), with hair spaces replaced by spaces
#   $rawtext - the same element rendered by HTML::FormatText (margins 0-78)
# Dies: when nano exits non-zero, or when a temporary file cannot be
#       opened or parsed.
sub edit_body {
my ($tmppath) = (@_);
# use a temp file to get the XHTML over to the next script
my $validator = File::Temp->new( TEMPLATE => 'temp.XXXXX',
DIR => '/tmp',
SUFFIX => '.body1.tmp',
UNLINK => 1 );
my $editor = File::Temp->new( TEMPLATE => 'temp.XXXXX',
DIR => '/tmp',
SUFFIX => '.body2.tmp',
UNLINK => 1 );
# $tmpfile is what the user edits; $vfile receives the copy given to tidy
my $tmpfile = $editor->filename;
-f $tmpfile && unlink($tmpfile); # clear the way for nano
my $vfile = $validator->filename;
-f $vfile && unlink($vfile); # clear the way for nano
my @cmd = ();
my $done = 0;
# each pass: edit, copy, tidy, validate; any failure returns to the editor
while (!$done) {
# edit body as tmpfile
@cmd = ('/usr/bin/nano', '--tabstospaces', $tmpfile);
system(@cmd) == 0
or die("editing '@cmd' failed: $?\n");
# don't allow empty body
if (!-e $tmpfile || -z $tmpfile) {
next;
}
# make a copy by reading one file and writing it to another name
open(my $tf, "<", $tmpfile)
or die("Could not open '$tmpfile' for reading\n");
my $lines = "";
while (my $line = <$tf>) {
# NOTE(review): pattern and replacement are identical as written, so
# this substitution changes nothing; the replacement was presumably
# meant to be an entity - confirm against the original file
$line =~ s| \& | \& |gm;
$lines .= $line;
}
close ($tf);
open(my $ov, ">", $vfile)
or die("Could not copy to '$vfile'\n");
# add paragraphs if there is no other XHTML markup
# (the first test is true as soon as one line does not start with a tag)
if ($lines =~ m/^(?!<[^>]+>).*$/m) {
# no /m or /g here: a single <p> goes at the very start of the text
$lines =~ s|^|<p>|;
# every run of blank lines closes one paragraph and opens the next
$lines =~ s|\n\n+|</p>\n<p>\n|gm;
} elsif ($lines =~ m/^(?!<[^>]+>).*(?=\n\n)/m) {
$lines =~ s|^|<p>|gm;
}
print $ov $lines;
close ($ov);
# force conversion of the second file to XHTML using tidy
@cmd = ('/usr/bin/tidy', '-m', '-q', '--show-info', 'no',
'--preserve-entities', 'yes', '-utf8', '-asxml', $vfile);
# validate the second file now that it has become XHTML
# (output and status of this first run are captured and then overwritten)
my ($stdout, $stderr, $result) = capture { system(@cmd) };
@cmd = ('/usr/bin/tidy', '-q', '--show-info', 'no',
'--preserve-entities', 'yes', '-utf8', '-xml', $vfile);
# only the status of this second run decides whether validation passed
($stdout, $result) = capture_stdout {system(@cmd)};
if ($result) {
print color('bold white');
print STDERR "HTML validation failed\n";
print STDERR "press RETURN to continue editing";
print color('reset');
my $i = <STDIN>;
$done = 0;
next;
} else {
# look for hotlinked images, report error if they are found
my $xhtml = HTML::TreeBuilder::XPath->new;
$xhtml->implicit_tags(1);
$xhtml->parse_file($vfile)
or die("Could not parse '$vfile' : $!\n");
my $error = 0;
# an image counts as hotlinked when its src starts with "http"
for my $hotlink ($xhtml->findnodes('//img[starts-with(@src,"http")]')) {
$error++;
}
if ($error) {
print color('bold white');
print STDERR "Failure: image hotlinking present.";
print STDERR " Remove it to proceed.\n";
print STDERR "press RETURN";
print color('reset');
my $i = <STDIN>;
$done = 0;
# NOTE(review): this path skips the $xhtml->delete below
next;
} else {
$done++;
}
$error = 0;
# every image needs an ALT attribute that is present and not empty
for my $alt ($xhtml->findnodes('//img[not(@alt) or @alt[not(string())]]')) {
$error++;
}
if ($error) {
print color('bold white');
print STDERR "Failure: missing or empty ALT attribute in IMG.";
print STDERR " Add it to proceed.\n";
print STDERR "press RETURN";
print color('reset');
my $i = <STDIN>;
# undo the increment made after the hotlink check and edit again
$done = 0;
next;
} else {
$done++;
}
$xhtml->delete;
}
}
# parse the accepted file once more, this time leaving entities unexpanded
my $xhtml = HTML::TreeBuilder::XPath->new;
$xhtml->implicit_tags(1);
$xhtml->no_expand_entities(1);
open (my $xhtmlfile, "<", $vfile)
or die("Could not open '$vfile' for reading: $!\n");
$xhtml->parse_file($xhtmlfile)
or die("Could not parse '$vfile' : $!\n");
my $body = '';
my $rawtext = '';
my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 78);
# if several body elements were found, the last one would win
for my $bd ($xhtml->findnodes('//body')) {
$rawtext = $formatter->format($bd);
$body = $bd->as_HTML('', ' ', {});
}
close($xhtmlfile);
close($editor);
close($validator);
# convert from object to plain text containing markup
# NOTE(review): ReadOn() is defined outside this view - confirm what it does
$body = ReadOn($body);
# turn 'hair space' into normal spaces
$body =~ s/\x{200a}/ /gm;
return($body, $rawtext);
}
# Work out the record number and ballast to use for a new post.
#
# Parameters:
#   $dbh  - database handle
#   $date - post date; a trailing "T..." part and all dashes are removed
#           before it is compared with keys.date
#   $slug - slug of the new post
# Returns: ($recno, $ballast)
#   $recno   - highest recno in keys plus one, or 1 when there is none
#   $ballast - 0 when date and slug are unused, otherwise one more than
#              the highest ballast already stored for that pair
sub get_next_available_recno {
    my ($dbh, $date, $slug) = @_;
    $date =~ s/T.*//;
    $date =~ s/-//g;

    # is this date/slug pair taken already?
    my $lookup = $dbh->prepare('SELECT * from keys WHERE date=? AND slug=?
ORDER BY ballast DESC LIMIT 1');
    $lookup->execute($date,$slug);
    my $ballast = 0;
    my $clash = $lookup->fetchrow_hashref;
    if ($clash) {
        $ballast = $clash->{'ballast'} + 1;
        $lookup->finish;
    }

    # get the next record number
    $lookup = $dbh->prepare('SELECT max(recno) from keys');
    $lookup->execute();
    my $highest = $lookup->fetch;
    my $recno = 1;
    if ($highest->[0]) {
        $recno = $highest->[0]+1;
    }
    $lookup->finish;
    return($recno, $ballast);
}
# Insert the key row for a new post, with written set to 0.
#
# Parameters:
#   $dbh     - database handle
#   $recno   - record number to insert
#   $date    - post date; a trailing "T..." part and all dashes are removed
#   $slug    - slug of the post
#   $ballast - ballast value for the post
# Returns: $recno on success; 0 after any failed insert, in which case the
#          transaction is rolled back and a "slug not unique" hint printed.
sub write_keys {
    my ($dbh, $recno, $date, $slug, $ballast) = @_;
    $date =~ s/T.*//;
    $date =~ s/-//g;
    my $insert = $dbh->prepare('INSERT INTO
keys (recno, date, slug, ballast, written)
VALUES (?, ?, ?, ?, ?)');
    eval {
        $insert->execute($recno, $date, $slug, $ballast, 0);
    };
    unless ($@) {
        $insert->finish;
        return($recno);
    }
    # the insert threw: undo and tell the user
    $insert->finish;
    $dbh->rollback;
    print color('bold white');
    print STDERR "slug not unique for that date\n";
    print STDERR "try again with another slug or perhaps another title\n";
    print color('reset');
    return(0); # error
}
# Insert the Dublin Core metadata rows of a new post.
#
# Parameters:
#   $dbh         - database handle
#   $recno       - record number the rows belong to
#   $title       - stored as dc.title
#   $author      - stored as dc.creator
#   $date        - checked with iso_8601_date(); stored as both
#                  dc.date.created and dc.date.modified
#   $description - stored as dc.description
# Returns: 1 when all five rows were inserted.
# Dies:    when the date is rejected, or when an insert fails (the
#          transaction is rolled back first and the failing term named).
sub write_metadata {
    my ($dbh, $recno, $title, $author, $date, $description) = (@_);
    # this check is probably redundant now
    $date = &iso_8601_date($date);
    die("Invalid date for record $recno\n") unless $date;

    # insertion order is the same as before: title, dates, creator, description
    my @fields = (
        ['dc.title',         $title],
        ['dc.date.created',  $date],
        ['dc.date.modified', $date],
        ['dc.creator',       $author],
        ['dc.description',   $description],
    );
    my $sth = $dbh->prepare('INSERT INTO
metadata (recno, term, value)
VALUES(?, ?, ?)');
    for my $field (@fields) {
        my ($term, $value) = @{$field};
        eval {
            $sth->execute($recno, $term, $value);
        };
        if ($@) {
            # keep the reason now; finish and rollback may overwrite $@
            my $reason = $@;
            $sth->finish;
            $dbh->rollback;
            die("Could not insert $term: $reason\n");
        }
    }
    $sth->finish;
    return(1);
}
# Store the markup and the plain-text rendering of a post.
#
# Parameters:
#   $dbh     - database handle
#   $recno   - record number of the post
#   $post    - markup, inserted into table body
#   $rawtext - plain text, inserted into table rawtext
# Returns: 1 when both rows were inserted.
# If either insert throws, the transaction is rolled back and the program
# exits with status 1.
sub write_body {
    my ($dbh, $recno, $post, $rawtext) = @_;
    my @inserts = (
        ['INSERT INTO body (recno, body) VALUES(?, ?)', $post],
        ['INSERT INTO rawtext (recno, fulltext) VALUES(?, ?)', $rawtext],
    );
    for my $insert (@inserts) {
        my ($sql, $content) = @{$insert};
        my $sth = $dbh->prepare($sql);
        eval {
            $sth->execute($recno, $content);
        };
        if ($@) {
            $sth->finish;
            $dbh->rollback;
            exit(1); # error
        }
        $sth->finish;
    }
    return(1);
}
# Abandon the session: roll back, disconnect, report on STDERR and exit
# with status 0.
#
# Parameters:
#   $dbh - database handle
sub done {
    my $dbh = shift;
    # undo all the changes, then let go of the database
    for my $step (qw(rollback disconnect)) {
        $dbh->$step;
    }
    print STDERR "quitting $!\n";
    exit (0);
}
# Check a date string and return it in the form the database stores.
#
# Accepted forms:
#   YYYY-MM-DDTHH:MM:SS...  returned unchanged (nothing after the seconds
#                           is checked)
#   YYYY-MM-DD              returned with "T00:00" appended
#   YYYY-MM-DDTHH:MM        returned unchanged
#   YYYYMMDDTHH:MM          returned unchanged
# Returns: the date string, or 0 when none of the forms match.
sub iso_8601_date {
    my ($date) = @_;
    my $ymd = qr/[0-9]{4}-[0-9]{2}-[0-9]{2}/;
    my $hm  = qr/[0-9]{2}:[0-9]{2}/;

    return($date) if ($date =~ m/^${ymd}T${hm}:[0-9]{2}/);
    return($date) if ($date =~ s/^(${ymd})$/$1T00:00/);
    return($date) if ($date =~ m/^${ymd}T${hm}$/);
    return($date) if ($date =~ m/^[0-9]{8}T${hm}$/);
    return(0);
}
./tm-extract-posts-sql.pl
#!/usr/bin/perl
# 2022-09-07
# fetches posts from database and
# writes both XHTML and GemText versions in parallel
# to their default directories,
# unless the defaults are overridden with -g or -x
use utf8;
use Getopt::Long;
use Date::Calc qw/check_date Today/;
use DBI qw(:sql_types);
use File::Path qw(make_path);
use URI::Escape;
use URI;
use Date::Calc qw (Date_to_Time);
use POSIX qw (strftime);
use HTML::TreeBuilder::XPath;
use HTML::Entities qw/encode_entities_numeric decode_entities/;
use Encode; # decode is needed for HTML::TreeBuilder::XPath
use Data::Dumper qw/Dumper/;
use English;
use strict;
use warnings;
# Database and default output locations; the two paths are only shown by
# usage(), extract_and_write() carries its own defaults.
our $dbfile="/var/www/tuxmachines.org/db/tm-static-site-generator.sqlite3";
# fixed: the host name in this path was misspelled "tuxmacihines.org"
our $xhtml_path="/var/www/tuxmachines.org/htdocs/n";
our $gemtext_path="/home/gemini/gemini/n";

# command line options, keyed by their one-letter name
our %opt;
our $VERBOSE = 0;
GetOptions ("all" => \$opt{'a'},
            "date=s" => \$opt{'d'},
            "force" => \$opt{'f'},
            "gemini:s" => \$opt{'g'},
            "help" => \$opt{'h'},
            "since" => \$opt{'s'},
            "unwritten" => \$opt{'u'},
            "xhtml:s" => \$opt{'x'},
            "verbose+" => \$opt{'v'},
    );
my $script = $0;
if (defined($opt{'h'})) {
    &usage($script);
}
if (defined($opt{'v'})) {
    $VERBOSE = $opt{'v'};
}
# -g and -x take an optional value, so an empty one means the path is missing
if (defined($opt{'g'}) && !$opt{'g'}) {
    print "\nGemText path missing\n\n";
    &usage($script);
}
if (defined($opt{'x'}) && !$opt{'x'}) {
    print "\nHTML path missing\n\n";
    &usage($script);
}
my ($year, $month, $day) = &get_date($opt{'d'});
if ($opt{'s'}) {
    print "Starting Date: $year/$month/$day\n" if ($VERBOSE);
} else {
    print "Date: $year/$month/$day\n" if ($VERBOSE);
}
&extract_and_write($year,$month,$day);
exit(0);
# Print the help text on STDOUT and exit with status 0.
#
# Parameters:
#   $script - program name shown in the synopsis line
sub usage {
    my ($script) = @_;
    my @help = (
        "USAGE:\n\n",
        "$script [-ahfsuv] [-d date] [-g path] [-x path]\n\n",
        " -a, --all extract all records regardless of other settings\n",
        " -d, --date date as YYYYMMDD, defaults to today if missing\n",
        " -f, --force force all files, written or unwritten\n",
        " -g, --gemini override default destination path for GemText\n",
        " -s, --since also include all posts since the given date\n",
        " -u, --unwritten extract all unwritten records\n",
        " -x, --xhtml override default destination path for XHTML\n",
        " -v, --verbose show debugging info\n",
        "\n",
        " -h, --help show this message\n",
        "\n",
        "By default, only records which have not been extracted yet\n",
        "will be written. This can be overriden with the -f option.\n",
        "The -g and -x options can each be used to point to other paths\n",
        "and override the defaults:\n",
        " GemText path:\n\t$gemtext_path\n",
        " XHTML path:\n\t$xhtml_path\n",
        "The -a and the -u option are mutually exclusive and -a takes\n",
        "precedence.\n",
        "\n",
    );
    print @help;
    exit(0);
}
# Pick the output directory: the one given on the command line when there
# is one, otherwise the default.
#
# Parameters:
#   $p       - path from the command line; may be undef or empty
#   $default - returned unchanged and unchecked when $p is false
# Returns: $default, or $p rewritten as an absolute path without empty,
#          "." or ".." components.  A path that reduces to "/" is replaced
#          by $default before it is checked.
# Dies:    when the resulting path is missing, is not a directory, or is
#          not writable.
sub get_path {
    my ($p,$default) = (@_);
    return($default) unless ($p);

    # Walk the components left to right, so that ".." always removes the
    # component kept just before it.  The previous right-to-left walk got
    # "a/b/../.." and "a/./.." wrong.
    my @canonical_path = ();
    for my $dir (split(m/\//, $p)) {
        next if (!length($dir));
        next if ($dir eq ".");
        if ($dir eq "..") {
            pop @canonical_path;    # popping an empty list stays at the root
            next;
        }
        push @canonical_path, $dir;
    }
    my $path = '/'.join("/", @canonical_path);
    if ($path eq '/') {
        $path = $default;
    }
    if (-d $path) {
        if (-w $path) {
            return($path);
        }
        die("The directory '$path' is not writable\n");
    }
    if (-e $path) {
        die("The destination '$path' is not a directory\n");
    }
    die("The directory '$path' does not exist\n");
}
# Return the date to work with: the one from the command line when given,
# otherwise today's date in GMT.
#
# Parameters:
#   $d - date as YYYY-MM-DD or YYYYMMDD; may be undef or empty
# Returns: ($year, $month, $day) as zero-padded strings.
# Prints "Invalid date" on STDERR and exits with status 1 when $d has the
# wrong form or is not a real calendar date.
sub get_date {
    my ($d) = @_;

    if (!$d) {
        my ($year, $month, $day) = Today(1); # get date GMT
        return(sprintf("%04d", $year),
               sprintf("%02d", $month),
               sprintf("%02d", $day));
    }

    my $date = '';
    if ($d =~ m/^([0-9]{4}-[0-9]{2}-[0-9]{2})$/
        or
        $d =~ m/^([0-9]{4}[0-9]{2}[0-9]{2})$/) {
        $date = $1;
        $date =~ s/-//g;
    }
    if (!$date) {
        print STDERR qq(Invalid date '$d'\n);
        exit(1);
    }
    my ($year, $month, $day) =
        ($date =~ m/^([0-9]{4})([0-9]{2})([0-9]{2})$/);
    if (! check_date($year,$month,$day)) {
        print STDERR qq(Invalid date '$date'\n);
        exit(1);
    }
    return($year, $month, $day);
}
# Fetch the posts selected by the command line options and write each of
# them as XHTML and as GemText.
#
# Parameters:
#   $year, $month, $day - date handed to query() as "YYYY-MM-DD"
# Returns: 1 after writing, 0 when no record was selected.
#
# The record just before and the record just after the selected set are
# written again as well, since their previous/next links may have changed.
#
# Fixes:
#  - the record following the set is now looked up from the highest record
#    number; it used to be looked up from the lowest
#  - the date directory is joined to the output path with a slash, which
#    was missing when the path came from -x or -g
#  - the neighbour loop no longer overwrites the outer $date
#  - a record without a dc.title row gets an empty title
sub extract_and_write {
    my ($year,$month,$day) = (@_);
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 0, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my $date = "$year-$month-$day";

    # columns copied for every record; same order as the values that
    # prev_recno() and next_recno() return after the record number
    my @fields = qw(date slug ballast written);

    # choose among option a, u, f, or s
    my $sth = &query($date, $dbh);
    my %record = ();
    my $lowest = 0;
    my $highest = 0;
    # rows arrive in ascending recno order: first is lowest, last is highest
    while (my $data = $sth->fetchrow_hashref) {
        my $recno = $data->{'recno'};
        if (!$lowest) {
            $lowest = $recno;
        }
        $highest = $recno;
        @{ $record{$recno} }{@fields} = @{$data}{@fields};
    }
    $sth->finish;
    if ($VERBOSE) {
        print "HI: $highest\nLOW: $lowest\n";
    }

    # make an extended set for titles and links outside the written set
    my %full_list = ();

    # get the metadata for the first record before the retrieved set
    if ($lowest) {
        my ($prev, @meta) = &prev_recno($lowest);
        if ($prev) {
            @{ $record{$prev} }{@fields} = @meta;
            ($prev, @meta) = &prev_recno($prev);
            if ($prev) {
                @{ $full_list{$prev} }{@fields} = @meta;
            }
        }
    }

    # get the metadata for the next record after the retrieved set
    if ($highest) {
        my ($next, @meta) = &next_recno($highest);
        if ($next) {
            @{ $record{$next} }{@fields} = @meta;
            ($next, @meta) = &next_recno($next);
            if ($next) {
                @{ $full_list{$next} }{@fields} = @meta;
            }
        }
    }

    # cache previous/next data for each record in the set
    for my $recno (sort {$a <=> $b} keys %record) {
        my ($next, @next_meta) = &next_recno($recno);
        if ($next) {
            $full_list{$recno}{'next'} = $next;
            @{ $full_list{$next} }{@fields} = @next_meta;
        }
        my ($prev, @prev_meta) = &prev_recno($recno);
        if ($prev) {
            $full_list{$recno}{'prev'} = $prev;
            @{ $full_list{$prev} }{@fields} = @prev_meta;
        }
    }

    # third cycle: is this necessary? can title be collected earlier?
    my $title_sth = $dbh->prepare('SELECT metadata.value, keys.ballast
FROM metadata JOIN keys
ON keys.recno=?
AND metadata.recno=keys.recno
AND metadata.term="dc.title"');
    for my $recno (sort {$a <=> $b} keys %full_list) {
        $title_sth->execute($recno) or die();
        my $rec = $title_sth->fetchrow_hashref;
        my $title = '';
        if ($rec && defined($rec->{'value'})) {
            $title = $rec->{'value'};
        }
        $title = encode_entities_numeric(decode_entities($title), '&');
        $full_list{$recno}{'title'} = $title;
        $title_sth->finish;
    }
    $dbh->disconnect;

    if (!%record) {
        print "No records or no unwritten records.\n\n";
        return(0);
    }

    # the destinations are the same for every record: resolve them once
    # http / https
    my $xhtml_root = &get_path($opt{'x'}, '/var/www/tuxmachines.org/htdocs/n/');
    # gemini
    my $gemtext_root = &get_path($opt{'g'}, '/home/gemini/gemini/n/');
    # the default ends in a slash, an overriding path does not
    (my $xhtml_base = $xhtml_root) =~ s|/+$||;
    (my $gemtext_base = $gemtext_root) =~ s|/+$||;

    # it's probably faster to write both types than to track both separately
    for my $recno (sort {$a <=> $b} keys %record) {
        my $slug = $record{$recno}{'slug'};
        my $ballast = $record{$recno}{'ballast'};
        my $date_created = $record{$recno}{'date'};
        $date_created =~ s|^([0-9]{4})([0-9]{2})([0-9]{2})$|$1/$2/$3|;

        print " XHTML Path: $xhtml_root\n" if ($VERBOSE);
        my $xhtml = &generate_xhtml($recno, \%full_list);
        &write_xhtml($recno, "$xhtml_base/$date_created",
                     $slug, $ballast, $xhtml);

        print " GemText Path: $gemtext_root\n" if ($VERBOSE);
        my $gemtext = &generate_gemtext($recno, \%full_list);
        &write_gemtext($recno, "$gemtext_base/$date_created",
                       $slug, $ballast, $gemtext);
    }
    return(1);
}
# Build and run the query that selects the records to extract.
#
# Parameters:
#   $date - date as "YYYY-MM-DD"; only used by the date based queries
#   $dbh  - database handle
# Returns: the executed statement handle; rows carry recno, date, slug,
#          ballast and written, in ascending recno order.
#
# Options (from %opt), in order of precedence:
#   -a   every record
#   -u   every record not written yet
#   else records whose dc.date.created or dc.date.modified starts with
#        $date (-s: is on or after $date); without -f only those not
#        written yet
#
# Fix: the error messages used to contain "$dbh->errstr()" inside the
# string, which prints the handle followed by "->errstr()" instead of
# calling the method.
sub query {
    my ($date, $dbh) = (@_);
    my $query;
    my @bind = ();
    if ($opt{'a'} || $opt{'u'}) {
        # -a takes precedence over -u
        my $only_unwritten = $opt{'a'} ? '' : 'AND written=0';
        $query = qq(SELECT recno,date,slug,ballast,written
            FROM keys
            WHERE recno>=1
            $only_unwritten
            ORDER BY recno ASC);
    } else {
        my $only_unwritten = $opt{'f'} ? '' : 'AND written=0';
        my $comparison = $opt{'s'} ? '>=' : '=';
        $query = qq(SELECT keys.recno,keys.date,keys.slug,
            keys.ballast,keys.written
            FROM keys
            INNER JOIN metadata
            ON keys.recno = metadata.recno
            $only_unwritten
            AND ( metadata.term="dc.date.modified"
            OR
            metadata.term="dc.date.created" )
            AND substr(metadata.value,1,10)$comparison?
            ORDER BY keys.recno ASC);
        @bind = ($date);
    }
    # $sth Statement handle object
    my $sth = $dbh->prepare($query)
        or die("prepare statement failed: ".$dbh->errstr."\n");
    $sth->execute(@bind)
        or die("execute statement failed: ".$sth->errstr."\n");
    if ($VERBOSE > 1) {
        print "Main Query= $query\n";
    }
    return($sth);
}
# Look up the record that follows a given record number.
#
# Parameters:
#   $recno - record number to start from
# Returns: ($next, $date, $slug, $ballast, $written) of the record with
#          the lowest recno above $recno, or five zeros when there is none.
# Opens its own database connection for the lookup and closes it again.
sub next_recno {
    my ($recno) = @_;
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 1, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my $sth = $dbh->prepare(qq(SELECT recno, date, slug, ballast, written
FROM keys
WHERE recno >?
ORDER BY recno ASC LIMIT 1))
        or die();
    $sth->execute($recno);
    my ($next, $date, $slug, $ballast, $written) = (0) x 5;
    my $row = $sth->fetchrow_hashref;
    if ($row) {
        ($next, $date, $slug, $ballast, $written) =
            @{$row}{qw(recno date slug ballast written)};
    }
    $sth->finish;
    $dbh->disconnect;
    return($next, $date, $slug, $ballast, $written);
}
# Look up the record that precedes a given record number.
#
# Parameters:
#   $recno - record number to start from
# Returns: ($prev, $date, $slug, $ballast, $written) of the record with
#          the highest recno below $recno, or five zeros when there is none.
# Opens its own database connection for the lookup and closes it again.
sub prev_recno {
    my ($recno) = @_;
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 1, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my $sth = $dbh->prepare(qq(SELECT recno, date, slug, ballast, written
FROM keys
WHERE recno <?
ORDER BY recno DESC LIMIT 1))
        or die();
    $sth->execute($recno);
    my ($prev, $date, $slug, $ballast, $written) = (0) x 5;
    my $row = $sth->fetchrow_hashref;
    if ($row) {
        ($prev, $date, $slug, $ballast, $written) =
            @{$row}{qw(recno date slug ballast written)};
    }
    $sth->finish;
    $dbh->disconnect;
    return($prev, $date, $slug, $ballast, $written);
}
# Build the complete XHTML page of one post.
#
# Parameters:
#   $recno - record number of the post
#   $_[0]  - reference to the hash of neighbour data, keyed by record
#            number, with date, slug, ballast, title and, for the post
#            itself, prev and next
# Returns: the page as one string.
# Dies:    with "Missing date" when a linked neighbour has no date.
sub generate_xhtml {
    my $recno = shift;
    my %data = %{$_[0]};
    print "Writing XHTML $recno\n" if ($VERBOSE);

    my ($head, $title, $author, $date_created, $date_modified) =
        &fetch_head($recno);
    $head = "<!-- $recno -->\n".$head;

    # address and title of a neighbouring post
    my $neighbour = sub {
        my ($other) = @_;
        my $date = $data{$other}{'date'};
        my $link_title = $data{$other}{'title'};
        die("Missing date\n") if (!$date);
        $date =~ s|^([0-9]{4})([0-9]{2})([0-9]{2})$|$1/$2/$3|;
        my $slug = $data{$other}{'slug'};
        my $ballast = $data{$other}{'ballast'};
        my $url = $ballast
            ? "/n/$date/$slug.$ballast.shtml"
            : "/n/$date/$slug.shtml";
        return($url, $link_title);
    };

    # without a neighbour, a placeholder anchor is shown instead of a link
    my $prev_link = qq(<a name="prev">previous</a>);
    if ($data{$recno}{'prev'}) {
        my ($url, $link_title) = $neighbour->($data{$recno}{'prev'});
        $prev_link = qq(<a href="$url">$link_title</a>);
        $head .= qq( <link rel="prev" href="$url" />\n);
    }
    my $next_link = qq(<a name="next">next</a>);
    if ($data{$recno}{'next'}) {
        my ($url, $link_title) = $neighbour->($data{$recno}{'next'});
        $head .= qq( <link rel="next" href="$url" />\n);
        $next_link = qq(<a href="$url">$link_title</a>);
    }

    # the modification date is mentioned only when it is later
    my $pdate = &pdate($date_created);
    if ($date_modified gt $date_created) {
        $pdate .= ",<br />\nupdated ".&pdate($date_modified);
    }
    my $body = &fetch_xhtml_body($recno);
    return(&new_xhtml_document($title,$pdate,$author,
                               $prev_link,$next_link,$head,$body));
}
# Read the metadata of one post and turn it into <head> content.
#
# Parameters:
#   $recno - record number of the post
# Returns: ($head, $title, $author, $date_created, $date_modified)
#   $head holds one <meta> element per metadata row except "slug", plus a
#   <title> element placed just before the dc.title row; the other values
#   are empty strings when the matching row is absent.
# Opens its own database connection and closes it again.
sub fetch_head {
    my $recno = shift;
    my ($title, $author, $date_created, $date_modified) = ('') x 4;
    # where the value of each term of interest ends up
    my %slot_for = (
        'dc.title'         => \$title,
        'dc.creator'       => \$author,
        'dc.date.created'  => \$date_created,
        'dc.date.modified' => \$date_modified,
    );
    my @head = ();
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 0, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my $sth = $dbh->prepare(qq(SELECT term,value FROM metadata WHERE recno=?));
    $sth->execute($recno);
    while (my $row = $sth->fetchrow_hashref) {
        my ($term, $value) = @{$row}{qw(term value)};
        next if ($term eq 'slug');
        if (my $slot = $slot_for{$term}) {
            ${$slot} = $value;
        }
        if ($term eq 'dc.title') {
            push(@head, qq(<title>Tux Machines — $title</title>));
        }
        push(@head, qq(<meta name="$term" content="$value" />));
    }
    my $head = " ".join("\n ", @head)."\n";
    $sth->finish;
    $dbh->disconnect;
    return($head, $title, $author, $date_created, $date_modified);
}
# Read the stored markup of one post.
#
# Parameters:
#   $recno - record number of the post
# Returns: the body column of the matching row (of the last one, should
#          there be several), or '' when there is no row.
# Opens its own database connection and closes it again.
sub fetch_xhtml_body {
    my ($recno) = @_;
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 0, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my $sth = $dbh->prepare(qq(SELECT body FROM body WHERE recno=?));
    $sth->execute($recno);
    my $body = '';
    while (my ($markup) = $sth->fetchrow_array) {
        $body = $markup;
    }
    $sth->finish;
    $dbh->disconnect;
    return($body);
}
# Fill the page template of a post.
#
# Parameters:
#   $title    - shown as the <h1> of the post
#   $pdate    - date text shown after the author
#   $author   - name shown in the "posted by" line
#   $prevlink - markup of the first navigation item
#   $nextlink - markup of the second navigation item
#   $head     - markup placed at the top of <head>
#   $post     - markup of the post itself
# Returns: the page as one string.  The values are inserted as they are;
#          the server-side include comments are part of the template.
sub new_xhtml_document {
    my ($title, $pdate, $author, $prevlink, $nextlink, $head, $post) = @_;
    return(<<"EOHTML");
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
$head
<link rel="stylesheet" href="/CSS/tuxmachines.css"
media="screen" type="text/css" />
<link rel="alternate" type="application/atom+xml" href="/feed.xml"
title="Tux Machines" />
<link rel="shortcut icon" href="/Images/whitejazz_favicon_0.ico"
type="image/x-icon" /></head>
<body>
<!--#include virtual="/header.html"-->
<!--#include virtual="/feeds.html"-->
<div class="post">
<!--#include virtual="/navigation.html"-->
<div class="navigation2">
<ul>
<li>$prevlink</li>
<li>$nextlink</li>
</ul>
</div>
<h1>$title</h1>
<p class="author">posted by $author on $pdate<br /></p>
$post
</div>
<h1>Other Recent Tux Machines' Posts</h1>
<!--#include virtual="/latest-news.html"-->
<!--#include virtual="/footer.html"-->
</body>
</html>
EOHTML
}
# Write the XHTML page of a post and mark the post as written.
#
# Parameters:
#   $recno   - record number of the post
#   $path    - directory the file goes into; created when missing
#   $slug    - base name of the file
#   $ballast - when true, inserted between slug and ".shtml"
#   $xhtml   - content of the page
# Returns: 1 on success, 0 when the directory cannot be used.
# Dies:    when the file cannot be opened, written or closed, or when the
#          database update fails.
#
# Fixes: a failed prepare used to call errstr on the still undefined
# statement handle, and write errors on the page went unnoticed.
sub write_xhtml {
    my ($recno, $path, $slug, $ballast, $xhtml) = (@_);
    if (! &prepare_directory($path)) {
        return(0);
    }
    my $file;
    if ($ballast) {
        $file = "$path/$slug.$ballast.shtml";
    } else {
        $file = "$path/$slug.shtml";
    }
    print " Fx: $file\n" if ($VERBOSE);
    open(my $doc, '>', $file)
        or die("Could not open '$file' for writing: $!\n");
    print {$doc} $xhtml
        or die("Could not write to '$file': $!\n");
    # buffered output may only fail once the file is closed
    close($doc)
        or die("Could not finish writing '$file': $!\n");

    # record that the page exists
    my $dbh2 = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                            { AutoCommit => 0, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my $query = qq(UPDATE keys
SET written=1
WHERE recno=?);
    if ($VERBOSE > 2) {
        print "Update recno = $recno\n";
        print "Update query = $query\n";
        print "Update dbfile = '$dbfile'\n";
    }
    my $sth2 = $dbh2->prepare($query)
        or die($dbh2->errstr."\n");
    $sth2->execute($recno)
        or die($sth2->errstr."\n");
    $sth2->finish;
    $dbh2->commit;
    $dbh2->disconnect;
    return(1);
}
# Make sure a directory exists and can be written to.
#
# Parameters:
#   $path - directory to check; created with mode 0775, parents included,
#           when nothing exists under that name
# Returns: 1 when the directory is ready, 0 (after a message on STDERR)
#          when the name is taken by something else or is not writable.
# Dies:    when the directory cannot be created.
sub prepare_directory {
    my ($path) = @_;
    if (! -e $path) {
        make_path($path,{mode=>0775})
            or die("Could not create path '$path' : $!\n");
        print "Created directory '$path'\n" if ($VERBOSE);
        return(1);
    }
    if ( ! -d $path) {
        warn "Target already exists but is not a directory: '$path'\n";
        return(0);
    }
    if ( ! -w $path) {
        print STDERR "Target is not a writable: '$path'\n";
        return(0);
    }
    # path exists and is writable
    return(1);
}
# Turn a stored date into the form shown to readers, e.g. "Sep 07, 2022".
#
# Parameters:
#   $date - date starting with YYYY-MM-DD followed by "T"; the part after
#           the "T" is ignored
# Returns: the formatted date.
sub pdate {
    my ($date) = @_;
    my @ymd = ( $date =~ m/^([0-9]{4})-([0-9]{2})-([0-9]{2})T.*$/);
    # always hand over exactly three date values, matched or not
    my $midnight = Date_to_Time(@ymd[0 .. 2], 0, 0, 0);
    return(strftime("%b %d, %Y", gmtime($midnight)));
}
# Build the complete GemText page of one post.
#
# Parameters:
#   $recno - record number of the post
#   $_[0]  - reference to the hash of neighbour data, keyed by record
#            number, with date, slug, ballast, title and, for the post
#            itself, prev and next
# Returns: the page as one string; the body is encoded as UTF-8.
# Dies:    with "Missing date" when a linked neighbour has no date.
sub generate_gemtext {
    my $recno = shift;
    my %data = %{$_[0]};
    my $gemtext = '';
    print "Writing GemText $recno\n" if ($VERBOSE);

    my (undef, $title, $author, $date_created, $date_modified) =
        &fetch_head($recno);

    # link line pointing at a neighbouring post
    my $link_to = sub {
        my ($other) = @_;
        my $date = $data{$other}{'date'};
        my $link_title = $data{$other}{'title'};
        die("Missing date\n") if (!$date);
        $date =~ s|^([0-9]{4})([0-9]{2})([0-9]{2})$|$1/$2/$3|;
        my $slug = $data{$other}{'slug'};
        my $ballast = $data{$other}{'ballast'};
        my $url = $ballast
            ? "/n/$date/$slug.$ballast.gmi"
            : "/n/$date/$slug.gmi";
        return(qq(=>\t$url\t$link_title));
    };

    # a missing neighbour leaves the line empty
    my $prev_link = '';
    if ($data{$recno}{'prev'}) {
        $prev_link = $link_to->($data{$recno}{'prev'});
    }
    my $next_link = '';
    if ($data{$recno}{'next'}) {
        $next_link = $link_to->($data{$recno}{'next'});
    }

    # the modification date is mentioned only when it is later
    my $pdate = &pdate($date_created);
    if ($date_modified gt $date_created) {
        $pdate .= ",\nupdated ".&pdate($date_modified);
    }

    my $body = &xhtml_to_gemtext(&fetch_xhtml_body($recno));
    $body = encode('UTF-8', $body);
    $gemtext = &new_gemtext_document($title,$pdate,$author,
                                     $prev_link,$next_link,
                                     $body);
    return($gemtext);
}
# Convert the markup of a post to GemText.
#
# Parameters:
#   $post - markup of the post as UTF-8 encoded bytes
# Returns: the GemText as a decoded character string.
#
# The tree is rewritten step by step, and each step replaces the elements
# it handles with literal text, so the order matters: linked images,
# headings, ordered lists, unordered lists, block quotes, paragraphs, and
# finally any links left over.
#
# Fixes:
#  - items of unordered lists were formatted but never put into the tree
#  - h6 headings were skipped although a prefix is defined for them
#  - an image without a src attribute no longer produces a link
sub xhtml_to_gemtext {
    my ($post) = (@_);
    # utf8 kludge for HTML::TreeBuilder::XPath
    $post = decode('UTF-8', $post);
    my $xhtml = HTML::TreeBuilder::XPath->new;
    $xhtml->implicit_tags(1);
    $xhtml->no_space_compacting(0);
    $xhtml->parse($post)
        or die("Could not parse post content : $!\n");
    # GemText has three heading levels, deeper ones share the third
    my %prefix = (
        'h1' => "# ",
        'h2' => "## ",
        'h3' => "### ",
        'h4' => "### ",
        'h5' => "### ",
        'h6' => "### ",
    );

    # replace images with links to alt text or titles
    for my $anchor ($xhtml->findnodes("//a[img]")) {
        my $tmp = HTML::Element->new('~literal');
        for my $img ($anchor->findnodes("./img")) {
            my $src = $img->attr('src');
            # nothing to link to without a source
            next unless (defined($src));
            my $text = $img->attr('alt') || $img->attr('title') || '';
            my $u = URI->new_abs($src, 'https://news.tuxmachines.org/');
            my $url = $u->canonical;
            my $link;
            if ($text) {
                $link = qq(\n=>\t$url\t↺ $text\n);
            } else {
                $link = qq(\n=>\t$url\n);
            }
            $tmp->push_content($link);
        }
        $anchor->replace_with($tmp);
    }

    # format headings, plus any links they might contain
    for my $level (1 .. 6) {
        my $hn = qq(h$level);
        for my $heading ($xhtml->findnodes(".//$hn")) {
            my $h = "";
            if (defined($prefix{$hn})) {
                $h .= $prefix{$hn};
            }
            $h = qq(\n).$h.$heading->as_text.qq(\n\n);
            my $tmp = HTML::Element->new('~literal');
            $tmp->push_content($h);
            for my $anchor ($heading->findnodes('./a[@href]')) {
                my $link = &gemtext_link($anchor);
                $tmp->push_content($link."\n");
            }
            $tmp->push_content("\n");
            $heading->replace_with($tmp);
        }
    }

    # ordered lists, only one layer deep
    for my $ol ($xhtml->findnodes('//ol')) {
        my $item = 1;
        for my $li ($ol->findnodes('./li')) {
            my $new_li = HTML::Element->new('~literal');
            $new_li->push_content("* $item ".$li->as_text."\n\n");
            for my $anchor ($li->findnodes('./a[@href]')) {
                my $link = &gemtext_link($anchor);
                $new_li->push_content($link."\n");
            }
            $item++;
            $li->replace_with($new_li);
        }
    }

    # unordered lists, only one layer deep
    for my $ul ($xhtml->findnodes('//ul')) {
        for my $li ($ul->findnodes('./li')) {
            my $new_li = HTML::Element->new('~literal');
            my $content = $li->as_text;
            $content =~ s/\s+$//gm;
            $content =~ s/^\s+//gm;
            $new_li->push_content('* '.$content."\n\n");
            for my $anchor ($li->findnodes('./a[@href]')) {
                my $link = &gemtext_link($anchor);
                $new_li->push_content($link."\n");
            }
            # this call was missing, so the formatted item was thrown away
            $li->replace_with($new_li);
        }
    }

    # block quotes, only one layer deep
    for my $qq ($xhtml->findnodes('//blockquote')) {
        my $new_qq = HTML::Element->new('~literal');
        my $ppcount = 0;
        for my $pp ($qq->findnodes('./p')) {
            $ppcount++;
            my $as_text = $pp->as_text;
            $as_text =~ s/^\s+//g;
            $as_text =~ s/\s+$//g;
            $new_qq->push_content('> '.$as_text."\n\n");
            for my $anchor ($pp->findnodes('.//a[@href]')) {
                my $link = &gemtext_link($anchor);
                $new_qq->push_content($link."\n");
            }
            $new_qq->push_content("\n");
        }
        # a quote without paragraphs is quoted as a whole
        if (!$ppcount) {
            $new_qq->push_content('> '.$qq->as_text."\n\n");
        }
        for my $anchor ($qq->findnodes('.//a[@href]')) {
            my $link = &gemtext_link($anchor);
            $new_qq->push_content($link."\n");
        }
        $new_qq->push_content("\n");
        $qq->replace_with($new_qq);
    }

    # any remaining paragraphs
    for my $pp ($xhtml->findnodes('//p')) {
        my $new_pp = HTML::Element->new('~literal');
        my $as_text = $pp->as_text;
        $as_text =~ s/^\s+//g;
        $as_text =~ s/\s+$//g;
        $new_pp->push_content($as_text."\n\n");
        for my $anchor ($pp->findnodes('./a[@href]')) {
            my $link = &gemtext_link($anchor);
            $new_pp->push_content($link."\n");
        }
        $new_pp->push_content("\n");
        $pp->replace_with($new_pp);
    }

    # any remaining links
    for my $anchor ($xhtml->findnodes('//a[@href]')) {
        my $new_anchor = HTML::Element->new('~literal');
        my $link = &gemtext_link($anchor);
        $new_anchor->push_content($link."\n\n");
        $anchor->replace_with($new_anchor);
    }

    $post = $xhtml->as_text;
    $xhtml->destroy;
    # squeeze runs of blank lines down to one
    while ($post =~ s/\n\n\n/\n\n/gm) { 1 }
    # a list item that starts with a heading is shown as the heading alone
    while ($post =~ s/^\*\s+#/#/gm) { 1 }
    return($post);
}
# Turn an anchor element into a GemText link line.
#
# Parameters:
#   $anchor - element offering attr() and as_text()
# Returns: "=>", the address and the link text, separated by tabs.
#   The text is the anchor text without leading blanks; for an anchor of
#   class "readon" with a title it is "Read On: " plus that title.  Text
#   of http and https links is marked with a leading "↺ ".
sub gemtext_link {
    my ($anchor) = @_;
    my $href = $anchor->attr('href');
    my $text = $anchor->as_text;
    chomp($text);
    $text =~ s/^\s+//g;

    my $class = $anchor->attr('class');
    if (defined($class) && $class eq 'readon') {
        my $title = $anchor->attr('title');
        $text = "Read On: $title" if ($title);
    }
    $text = "↺ ".$text if ($href =~ m|^https?://|);
    return("=>\t$href\t$text");
}
# Fill the GemText template of a post.
#
# Parameters:
#   $title    - heading; every run of whitespace, line breaks included,
#               becomes a single space
#   $pdate    - date text shown after the author
#   $author   - name shown in the "Posted by" line
#   $prevlink - link line of the previous post, or ''
#   $nextlink - link line of the next post, or ''; shown first
#   $post     - body of the post
# Returns: the page as one string.
sub new_gemtext_document {
    my ($title, $pdate, $author, $prevlink, $nextlink, $post) = @_;
    # line breaks are whitespace too, so one substitution covers them
    $title =~ s/\s+/ /g;
    return(<<"EOGEMTEXT");
Tux Machines
# $title
Posted by $author on $pdate
$nextlink
$prevlink
$post
=> / gemini.tuxmachines.org
EOGEMTEXT
}
# Write the GemText page of a post.
#
# Parameters:
#   $recno   - record number of the post; not used here
#   $path    - directory the file goes into; created when missing
#   $slug    - base name of the file
#   $ballast - when true, inserted between slug and ".gmi"
#   $gemtext - content of the page
# Returns: 1 on success, 0 when the directory or the file cannot be used.
# Dies:    when the file cannot be opened for writing.
sub write_gemtext {
    my ($recno, $path, $slug, $ballast, $gemtext) = @_;
    my $name = $ballast ? "$slug.$ballast.gmi" : "$slug.gmi";
    my $file = "$path/$name";
    return(0) if (! &prepare_directory($path));
    if (! &is_file_writable($file)) {
        warn("'$slug' could not be written\n");
        return(0);
    }
    print " Fg: $file\n" if ($VERBOSE);
    # the content is encoded already, so no layer is set on the handle
    open(my $doc, '>', $file)
        or die("Could not open '$file' for writing: $!\n");
    print $doc $gemtext;
    close($doc);
    return(1);
}
# Tell whether a file may be written, replacing it if it exists.
#
# Parameters:
#   $file - path of the file
# Returns: 1 when nothing exists under that name or when it is a writable
#          regular file; 0, after a warning, in every other case.
sub is_file_writable {
    my ($file) = @_;
    # nothing in the way
    return(1) if (! -e $file);
    # overwrite by default
    if (! -f $file) {
        warn("Destination '$file' is not a regular file\n");
        return(0);
    }
    if (! -w $file) {
        warn("Destination '$file' is not writable\n");
        return(0);
    }
    return(1);
}
./tm-generate-feed.pl
#!/usr/bin/perl
use Getopt::Long;
use Date::Calc qw/check_date Today_and_Now Delta_DHMS/;
use DBI qw(:sql_types);
use XML::RSS; # RSS for HTML
use XML::Feed; # Atom for GemText
use URI::Escape;
use DateTime;
use Encode;
use HTML::Entities;
use Capture::Tiny qw(capture_stderr);
# use Data::Dumper qw(Dumper);
use English;
use warnings;
use strict;
# Main program of tm-generate-feed.pl: read the options, collect post
# metadata by date and/or by quantity, build the requested feed or list and
# print it on standard output.
#
# %opt and $VERBOSE are package globals because the subroutines read them.
our %opt;
our $VERBOSE = 0;
GetOptions ("xml|a" => \$opt{'a'},
            "date|d=s" => \$opt{'d'},
            "gemini" => \$opt{'g'},
            "number=i" => \$opt{'n'},
            "output=s" => \$opt{'o'},
            "xhtml|x" => \$opt{'x'},
            "verbose+" => \$opt{'v'},
            "help" => \$opt{'h'},
    );
if ($opt{'h'}) {
    &usage($0);
}
if ($opt{'v'}) {
    $VERBOSE = $opt{'v'};
}
# a selection by date, by quantity, or by both is required
if (!$opt{'d'} && !$opt{'n'}) {
    warn("Either a date -d or a quantity -n needs to be supplied.\n");
    exit(1);
}
my %metadata_date;   # by date only
my %metadata_number; # last n records only
# get posts on or since the date provided
if ($opt{'d'}) {
    my ($year, $month, $day) = get_date($opt{'d'});
    %metadata_date = &fetch_metadata_date($year,$month,$day);
    print "$year, $month, $day\n" if ($VERBOSE);
}
# get the latest N posts from the database
if ($opt{'n'}) {
    # force conversion to number
    my $nth = $opt{'n'} + 0;
    if (!$nth) {
        warn("An integer is missing. One is needed when -n is used.");
        exit(1);
    }
    %metadata_number = &fetch_metadata_nth($nth);
}
# union of both sets, keyed by record number; where a record is in both,
# the copy from the latest-N set is the one kept
my %metadata = (%metadata_date, %metadata_number);
my $feed;
if (defined($opt{'a'})) {
    # XML feeds
    if ($opt{'x'}) {
        $feed = &make_http_rss_feed(%metadata);
    } elsif ($opt{'g'}) {
        $feed = &make_gemini_atom_feed(%metadata);
    } else {
        die("An option -g or -x must be provided\n");
    }
} else {
    # native lists
    if ($opt{'x'}) {
        $feed = &make_xhtml_feed(%metadata);
    } elsif ($opt{'g'}) {
        $feed = &make_gemtext_feed(%metadata);
    } else {
        die("An option -g or -x must be provided\n");
    }
}
# The list builders return 0 when there is nothing to list.  Printing that
# would put a literal "0" on standard output, which a caller checking for
# empty output (test -s) would take for a real document, so print nothing.
if ($feed) {
    # try to capture warnings sent to STDERR about "wide characters" here
    my ($stderr, $result) = capture_stderr { print $feed };
}
exit(0);
# Explain options and usage on standard output, then exit with status 0.
# $script is the program name to show in the synopsis and the examples.
# Only options that GetOptions actually accepts are described, and the
# examples are complete enough to run.
sub usage {
    my ($script) = (@_);
    print "USAGE\n\n";
    print "$script [options]\n\n";
    print "Extract last n records and/or starting with the specified date and";
    print " form either an native list or an Atom feed. Default is a native";
    print " list.\n\n";
    print " -a, --xml produce an XML-based RSS 2.0 feed for XHTML\n";
    print " and produce an Atom feed for GemText\n";
    print " -d, --date YYYYMMDD or YYYY-MM-DD, take records modified on\n";
    print " or since that date\n";
    print " -g, --gemini make the either the gemtext list or Atom\n";
    print " feed use Gemini URLs\n";
    print " -n, --number take the last n records\n";
    print " -x, --xhtml make the either the definition list or Atom\n";
    print " feed use HTTP(S) URLs\n";
    print " -v, --verbose show debugging info\n";
    print "\n";
    print " -h, --help show this message\n";
    print "\n";
    print "Either -d or -n must be supplied, or both. If both are supplied";
    print " then the result is the union of both sets.\n";
    print "One of -g or -x must be supplied as well.\n\n";
    print "Example: \n";
    print " $script -v -x -d 20220711\n";
    print "\n";
    print "Example: \n";
    print " $script -g -n 10\n";
    exit(0);
}
# Validate a date and split it into its parts, or fall back to today.
#
#   $date - "YYYYMMDD" or "YYYY-MM-DD"; when false, today's date in GMT
#           is used instead
#
# Returns (year, month, day) as zero-padded strings.  Prints a message on
# STDERR and exits with status 1 when the date is malformed or is not a
# real calendar date according to check_date().
sub get_date {
    my ($date) = (@_);
    my ($year, $month, $day);
    if ($date) {
        # work on the argument itself, not on the global option hash
        ($year, $month, $day) =
            ($date =~ m/^([0-9]{4})-([0-9]{2})-([0-9]{2})$/);
        if (!defined($year)) {
            ($year, $month, $day) =
                ($date =~ m/^([0-9]{4})([0-9]{2})([0-9]{2})$/);
        }
        if (!defined($year) || ! check_date($year,$month,$day)) {
            print STDERR qq(Invalid date '), $date, qq('\n);
            exit(1);
        }
    } else {
        ($year,$month,$day) = Today_and_Now(1); # get date GMT
        $year = sprintf("%04d", $year);
        $month = sprintf("%02d", $month);
        $day = sprintf("%02d", $day);
    }
    return($year, $month, $day);
}
# fetch the posts made on or since YYYY MM DD
#
# The three arguments are joined as "YYYY-MM-DD" and compared as a string
# with the first ten characters of each record's dc.date.modified, so month
# and day need to be zero-padded.
#
# Returns a hash keyed by record number.  Each value is a hash reference
# with 'updated' (1 when modified after creation, else 0), 'url', and
# whichever of 'date', 'title', 'description' and 'author' the record has.
# 'url' is the site-relative path "/n/YYYY/MM/DD/slug[.ballast].shtml" when
# the record has a creation date, and the bare slug otherwise.
sub fetch_metadata_date{
    my ($year,$month,$day) = (@_);
    my $dbfile="/var/www/tuxmachines.org/db/tm-static-site-generator.sqlite3";
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 0, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my %metadata;
    # select the records modified on or after the date, newest first, noting
    # which of them have been updated since they were created
    my $query = qq(SELECT keys.recno AS recno, value, updated,
keys.ballast AS ballast, keys.slug AS slug
FROM keys
INNER JOIN (
SELECT created.recno, modified.value,
CASE
WHEN created.value<modified.value THEN 1
ELSE 0
END updated
FROM metadata created, metadata modified
WHERE created.recno=modified.recno
AND created.term="dc.date.created"
AND modified.term="dc.date.modified"
AND substr(modified.value,1,10)>=?) AS t3
ON t3.recno == keys.recno
ORDER BY t3.value DESC, recno DESC);
    my $sth = $dbh->prepare($query)
        or die("prepare statement failed: ".$dbh->errstr()."\n");
    # the per-record lookup never changes, so prepare it only once
    my $sth2 = $dbh->prepare(qq(SELECT term,value FROM metadata WHERE recno=?))
        or die("prepare statement failed: ".$dbh->errstr()."\n");
    my $date = "$year-$month-$day";
    print "Date $date\n" if ($VERBOSE);
    $sth->execute($date)
        or die("execute statement failed: ".$dbh->errstr()."\n");
    # Read the matching records and collect their metadata
    while (my $data = $sth->fetchrow_hashref) {
        my $recno = $data->{'recno'};
        my $ballast = $data->{'ballast'};
        $metadata{$recno}{'updated'} = $data->{'updated'};
        if ($ballast) {
            $metadata{$recno}{'url'} = $data->{'slug'}.'.'.$ballast;
        } else {
            $metadata{$recno}{'url'} = $data->{'slug'};
        }
        print "URL1 = ".$metadata{$recno}{'url'}."\n" if ($VERBOSE);
        $sth2->execute($recno)
            or die("execute statement failed: ".$dbh->errstr()."\n");
        my $date_created = '';
        while (my $record = $sth2->fetchrow_hashref) {
            my $term = $record->{'term'};
            my $value = $record->{'value'};
            if ($term =~ m/^dc\.date\.created/) {
                $date_created = $value;
            } elsif ($term eq 'dc.date.modified') {
                $metadata{$recno}{'date'} = $value;
            } elsif ($term eq 'dc.description') {
                $metadata{$recno}{'description'} = $value;
            } elsif ($term eq 'dc.title') {
                $metadata{$recno}{'title'} = $value;
            } elsif ($term eq 'dc.creator') {
                # the Atom feed asks for an author, as fetch_metadata_nth
                # already provides
                $metadata{$recno}{'author'} = $value;
            }
        }
        if ($VERBOSE > 1) {
            print "DC=$date_created\n";
        }
        if (defined($metadata{$recno}{'url'})
            && $date_created) {
            # the creation date decides the directory the post lives in
            my $path = $date_created;
            $path =~ s|^([0-9]{4})-([0-9]{2})-([0-9]{2})T.*$|$1/$2/$3|
                or die("Could not validate '$path'\n");
            $path = '/n/'.$path;
            my $url = $path.'/'.$metadata{$recno}{'url'}.'.shtml';
            $url =~ s|(?<!:)//|/|g;
            $url = uri_escape($url, "?'\"");
            $metadata{$recno}{'url'} = $url;
        }
    }
    $sth2->finish;
    $sth->finish;
    $dbh->disconnect;
    return(%metadata);
}
# fetch the N most recent posts from the database
#
# $nth is the maximum number of records to take, ordered by their
# dc.date.modified value, newest first.
#
# Returns a hash keyed by record number.  Each value is a hash reference
# with 'updated' (1 when modified after creation, else 0), 'url', and
# whichever of 'date', 'title', 'description' and 'author' the record has.
# 'url' is the site-relative path "/n/YYYY/MM/DD/slug[.ballast].shtml" when
# the record has a creation date, and the bare slug otherwise.
sub fetch_metadata_nth{
    my ($nth) = (@_);
    my $dbfile="/var/www/tuxmachines.org/db/tm-static-site-generator.sqlite3";
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 0, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my %metadata;
    # select the most recently modified records, noting which of them have
    # been updated since they were created
    my $query = qq(SELECT keys.recno AS recno, value, updated,
keys.ballast AS ballast, keys.slug AS slug
FROM keys
INNER JOIN (
SELECT created.recno, modified.value,
CASE
WHEN created.value<modified.value THEN 1
ELSE 0
END updated
FROM metadata created, metadata modified
WHERE created.recno=modified.recno
AND created.term="dc.date.created"
AND modified.term="dc.date.modified") AS t3
ON t3.recno == keys.recno
ORDER BY t3.value DESC, recno DESC
LIMIT ?);
    my $sth = $dbh->prepare($query)
        or die("prepare statement failed: ".$dbh->errstr()."\n");
    # the per-record lookup never changes, so prepare it only once
    my $sth2 = $dbh->prepare(qq(SELECT term,value FROM metadata WHERE recno=?))
        or die("prepare statement failed: ".$dbh->errstr()."\n");
    $sth->execute($nth)
        or die("execute statement failed: ".$dbh->errstr()."\n");
    # Read the matching records and collect their metadata
    while (my $data = $sth->fetchrow_hashref) {
        my $recno = $data->{'recno'};
        my $ballast = $data->{'ballast'};
        $metadata{$recno}{'updated'} = $data->{'updated'};
        if ($ballast) {
            $metadata{$recno}{'url'} = $data->{'slug'}.'.'.$ballast;
        } else {
            $metadata{$recno}{'url'} = $data->{'slug'};
        }
        print "URL2 = ".$metadata{$recno}{'url'}."\n" if ($VERBOSE);
        $sth2->execute($recno)
            or die("execute statement failed: ".$dbh->errstr()."\n");
        my $date_created = '';
        while (my $record = $sth2->fetchrow_hashref) {
            my $term = $record->{'term'};
            my $value = $record->{'value'};
            if ($term =~ m/^dc\.date\.created/) {
                $date_created = $value;
            } elsif ($term eq 'dc.date.modified') {
                $metadata{$recno}{'date'} = $value;
            } elsif ($term eq 'dc.description') {
                $metadata{$recno}{'description'} = $value;
            } elsif ($term eq 'dc.title') {
                $metadata{$recno}{'title'} = $value;
            } elsif ($term eq 'dc.creator') {
                $metadata{$recno}{'author'} = $value;
            }
        }
        if ($VERBOSE > 1) {
            print "DC=$date_created\n";
        }
        if (defined($metadata{$recno}{'url'})
            && $date_created ) {
            # the creation date decides the directory the post lives in
            my $path = $date_created;
            $path =~ s|^([0-9]{4})-([0-9]{2})-([0-9]{2})T.*$|$1/$2/$3|
                or die("Could not validate '$path'\n");
            $path = '/n/'.$path;
            my $url = $path.'/'.$metadata{$recno}{'url'}.'.shtml';
            $url =~ s|(?<!:)//|/|g;
            $url = uri_escape($url, "?'\"");
            $metadata{$recno}{'url'} = $url;
        }
    }
    $sth2->finish;
    $sth->finish;
    $dbh->disconnect;
    return(%metadata);
}
# Build the RSS 2.0 feed served over HTTP(S) and return it as a string.
#
# %protofeed maps record numbers to hash references holding 'url', 'title',
# 'description', 'date' and 'updated'.  Records without a 'url' are left
# out.  Every item carries an RFC 822 style pubDate: the record's 'date'
# when that can be parsed, the time of this run otherwise.
sub make_http_rss_feed {
    my (%protofeed) = (@_);
    # make xml/rss feed for use over HTTP / HTTPS
    my $http = "https://news.tuxmachines.org"; # hardcoded :(
    # see https://validator.w3.org/feed/docs/error/InvalidRFC2822Date.html
    my $rfc822 = '%a, %d %b %Y %H:%M:%S %z';
    my $dt = DateTime->now(time_zone=>'UTC');
    my $now = $dt->strftime($rfc822);
    # create an RSS 2.0 feed in UTF-8, without encoding non-ASCII entities
    my $feed = XML::RSS->new(encoding=>'UTF-8',
                             output => "2.0",
                             encode_output => 0);
    # channel metadata
    $feed->channel(title=>'Tux Machines',
                   link=>'https://tuxmachines.org/',
                   pubDate=>$now,
                   description => 'Do you waddle the waddle?',
                   language=>'en',
                   publisher=>'tuxmachines.org',
                   ttl => "300",
        );
    # add entries for each individual post in this feed
    # sorted in a special sequence, floating recently updated posts to the top
    for my $recno (sort {
        &by_updated($protofeed{$b}{'date'}, $protofeed{$a}{'date'},
                    $protofeed{$b}{'updated'}, $protofeed{$a}{'updated'})
            or $protofeed{$b}{'date'} cmp $protofeed{$a}{'date'}
            or $b cmp $a
                   } keys %protofeed) {
        # default to now, already formatted, unless replaced with
        # dc.date.modified; a bare DateTime object would not print as a
        # valid feed date
        my $pubDate = $now;
        if ( my ($y, $m, $d, $H, $M) =
             ($protofeed{$recno}{'date'}
              =~ m/^([0-9]{4})-([0-9]{2})-([0-9]{2})
                    T([0-9]{2}):([0-9]{2})/x)) {
            my $modified = DateTime->new(
                year => $y,
                month => $m,
                day => $d,
                hour => $H,
                minute => $M,
                time_zone => "UTC",
                );
            $pubDate = $modified->strftime($rfc822);
        }
        if (defined($protofeed{$recno}{'url'})) {
            my ($url, $title, $description);
            $url = $http.$protofeed{$recno}{'url'};
            $url = uri_escape($url, "?'\"");
            # only bare ampersands are turned into entities
            $title = decode('UTF-8',$protofeed{$recno}{'title'});
            $title = encode_entities($title, '&');
            $description = decode('UTF-8',$protofeed{$recno}{'description'});
            $description = encode_entities($description, '&');
            $feed->add_item(
                link => $url,
                title => $title,
                description => $description,
                pubDate => $pubDate,
                # W3C Feed Validator requires an e-mail address, not a string
                # author => $protofeed{$recno}{'author'},
                );
        }
    }
    return($feed->as_string);
}
# Build the Atom feed offered over the Gemini protocol and return it as an
# XML string.  %protofeed maps record numbers to hash references; records
# without a 'url' are left out.
# see https://validator.w3.org/feed/docs/error/InvalidRFC2822Date.html
# see https://www.rfc-editor.org/rfc/rfc4287.html
sub make_gemini_atom_feed {
    my (%protofeed) = (@_);
    my $now = DateTime->now(time_zone=>'UTC');
    my $gemini = 'gemini://gemini.tuxmachines.org/'; # hardcoded :(

    # feed-level metadata
    my $feed = XML::Feed->new('Atom');
    $feed->title('Tux Machines');
    $feed->link('gemini://tuxmachines.org/');
    $feed->self_link('gemini://tuxmachines.org/feed.xml');
    $feed->base('gemini://news.tuxmachines.org/');
    $feed->id('gemini://news.tuxmachines.org/');
    $feed->tagline('Do you waddle the waddle?');
    $feed->language('en');
    $feed->modified($now);

    # special sequence, floating recently updated posts to the top
    my @sequence = sort {
        &by_updated($protofeed{$b}{'date'}, $protofeed{$a}{'date'},
                    $protofeed{$b}{'updated'}, $protofeed{$a}{'updated'})
            or $protofeed{$b}{'date'} cmp $protofeed{$a}{'date'}
            or $b cmp $a
    } keys %protofeed;

    # one entry per post
    for my $recno (@sequence) {
        next if (!defined($protofeed{$recno}{'url'}));
        my $post = $protofeed{$recno};
        my $entry = XML::Feed::Entry->new();
        # URL paths ought to map 1:1 from http to gemini
        my $url = $gemini.$post->{'url'};
        $url =~ s/\.shtml$/.gmi/;
        $entry->id($url);
        $entry->link($url);
        $entry->title($post->{'title'});
        $entry->author($post->{'author'});
        # only the calendar date of the modification is carried over
        if ( my ($y, $m, $d) = ($post->{'date'}
                                =~ m/^([0-9]{4})-([0-9]{2})-([0-9]{2})/)) {
            $entry->modified(DateTime->new(year=>$y, month=>$m, day=>$d));
        }
        $entry->summary($post->{'description'});
        $feed->add_entry($entry);
    }

    # kludge for XML::Feed's hardcoded MIME Types
    # this is brittle
    my $xml = $feed->as_xml;
    $xml =~ s|^(\s*<link [^>]+) (type="text/html")|$1 type="text/gemini"|gm;
    return($xml);
}
# Build the XHTML fragment listing the posts as a definition list and return
# it, or return 0 when there is nothing to list.  %protofeed maps record
# numbers to hash references; records without a 'url' are left out.  Posts
# that updated() reports as recently updated get class="updated", and one
# empty definition is inserted where they give way to the other posts.
sub make_xhtml_feed {
    my (%protofeed) = (@_);
    # special sequence, floating recently updated posts to the top
    my @sequence = sort {
        &by_updated($protofeed{$b}{'date'}, $protofeed{$a}{'date'},
                    $protofeed{$b}{'updated'}, $protofeed{$a}{'updated'})
            or $protofeed{$b}{'date'} cmp $protofeed{$a}{'date'}
            or $b cmp $a
    } keys %protofeed;

    my $listing = qq(<div class="latest">\n)."<dl>\n";
    my $shown = 0;
    my $previous_fresh = 0;
    for my $recno (@sequence) {
        next if (!defined($protofeed{$recno}{'url'}));
        my $post = $protofeed{$recno};
        my $fresh = &updated($post->{'date'}, $post->{'updated'});
        # leaving the block of recently updated posts
        if ($previous_fresh && !$fresh) {
            $listing .= "\n <dd> </dd>\n\n";
        }
        $previous_fresh = $fresh;
        $shown++;
        my $url = uri_escape($post->{'url'},"?\"");
        my $class = $fresh ? ' class="updated"' : '';
        $listing .= ' <dt'.$class.'><a href="'.$url
            .'">'.$post->{'title'}.'</a></dt>'."\n";
        $listing .= ' <dd'.$class.'>'.$post->{'description'}."</dd>\n";
    }
    $listing .= "</dl>\n"."</div>\n";
    return($shown ? $listing : 0);
}
# Build the GemText fragment listing the posts as link lines, each followed
# by its description, and return it, or return 0 when there is nothing to
# list.  %protofeed maps record numbers to hash references; records without
# a 'url' are left out.  An extra blank line is inserted where the recently
# updated posts give way to the other posts.
sub make_gemtext_feed {
    my (%protofeed) = (@_);
    # special sequence, floating recently updated posts to the top
    my @sequence = sort {
        &by_updated($protofeed{$b}{'date'}, $protofeed{$a}{'date'},
                    $protofeed{$b}{'updated'}, $protofeed{$a}{'updated'})
            or $protofeed{$b}{'date'} cmp $protofeed{$a}{'date'}
            or $b cmp $a
    } keys %protofeed;

    my $listing = qq(\n);
    my $shown = 0;
    my $previous_fresh = 0;
    for my $recno (@sequence) {
        next if (!defined($protofeed{$recno}{'url'}));
        my $post = $protofeed{$recno};
        my $fresh = &updated($post->{'date'}, $post->{'updated'});
        $listing .= "\n" if ($previous_fresh && !$fresh);
        $previous_fresh = $fresh;
        $shown++;
        my $url = uri_escape($post->{'url'},"?\"");
        # whatever the extension was, the Gemini copy ends in .gmi
        $url =~ s/\.\w+$/.gmi/;
        $listing .= "=>\t".$url."\t".$post->{'title'}."\n";
        $listing .= ' '.$post->{'description'}."\n\n";
    }
    $listing .= "\n";
    return($shown ? $listing : 0);
}
# Comparison helper for the feed sorts: compares two posts by what updated()
# says about each of them and returns the result of cmp on the two answers.
sub by_updated {
    my ($date1, $date2, $updated1, $updated2) = (@_);
    my $first = &updated($date1, $updated1);
    my $second = &updated($date2, $updated2);
    return($first cmp $second);
}
# Report whether a post counts as recently updated: returns 1 when $updated
# is true and fewer than one whole day lies between $date and now (GMT),
# and 0 otherwise.  $date has to start with "YYYY-MM-DDTHH:MM".
sub updated {
    my ($date, $updated) = (@_);
    my ($y, $mo, $d, $h, $mi) =
        ($date =~ m/^([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}):([0-9]{2})/);
    my ($now_y, $now_mo, $now_d, $now_h, $now_mi) = Today_and_Now(1);
    # time since the update; only the whole days matter, seconds are ignored
    my ($days) = Delta_DHMS($y,$mo,$d, $h,$mi,00,
                            $now_y,$now_mo,$now_d, $now_h,$now_mi,00);
    return(1) if ($days < 1 && $updated);
    return(0);
}
./tm-generate-gemtext-index.sh
#!/bin/sh
# Rebuild the Gemini index page: start from the template, add a dated
# heading and the 15 latest posts, then a pointer to the Atom feed and the
# contents of ~gemini/hitclock.
PATH=/usr/local/bin:/usr/bin:/bin
h=/home/gemini/gemini/
cat $h/index.template > $h/index.gmi
date +"# Recent Posts as of %b %e, %Y%n" >> $h/index.gmi
tm-generate-feed.pl -g -n 15 >> $h/index.gmi
echo >> $h/index.gmi
cat <<EOT >> $h/index.gmi
## Additional Information
=> /feed.xml Atom Feed for this Gemini capsule
EOT
cat ~gemini/hitclock >> $h/index.gmi
exit 0
./tm-scale-and-process-image.pl
#!/usr/bin/perl -T
# 2022-09-05
use utf8;
use Getopt::Long;
use URI::Escape;
use URI;
use File::Temp qw(tempfile);
use Digest::SHA qw(sha256);
use File::Copy qw(copy);
use File::Basename qw/fileparse basename/;
use Image::Magick;
use Capture::Tiny qw(capture_stdout);
use Date::Calc qw/Today/;
use File::Path qw(make_path);
use DBI qw(:sql_types);
use English;
use strict;
use warnings;
# Main program of tm-scale-and-process-image.pl: fetch the image named by
# the URL argument, file it under the document root unless an identical
# image is already on record, make sure it has a thumbnail where one is
# called for, and print the XHTML markup for it on standard output.
our $VERBOSE = 0;
my $dbfile="/var/www/tuxmachines.org/db/tm-static-site-generator-img.sqlite3";
my $serverroot = '/var/www/tuxmachines.org';
my $documentroot = "$serverroot/htdocs";
my $dpath = &dpath('/i');
my $help = 0;
my $db = 0;
GetOptions ("database|d" => \$db,
            "verbose+" => \$VERBOSE,
            "help|h" => \$help,
    );
# untaint the $PATH
$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin';
# make sure the database file is there, but don't check schema
if ($db && ! -e $dbfile) {
    &prepare_database($dbfile);
} elsif (! -e $dbfile) {
    print "\nMissing database file \"$dbfile\"\n";
    print "Try using the --database option to create it.\n\n";
    &usage($0, $documentroot, $serverroot, $dpath);
    exit(1);
} elsif ($db) {
    print "Database file \"$dbfile\" already exists\n";
    print "Ignoring the --database option\n";
}
if ($help) {
    &usage($0, $documentroot, $serverroot, $dpath);
    exit(0);
}
if ($#ARGV > 0) {
    print "Too many command line arguments. Maybe quotes are missing?\n";
    &usage($0, $documentroot, $serverroot, $dpath);
    exit(1);
}
# a URL is obligatory
my $input = shift || 0;
if (! $input) {
    &usage($0, $documentroot, $serverroot, $dpath);
    exit(1);
}
# untaint the URL argument
my ($canonical,$dfile,$dext) = &cleaned_url($input);
# save the fetched image in a ephemeral file name
my $tmp = File::Temp->new( TEMPLATE => 'temp.XXXXX',
                           DIR => '/tmp',
                           SUFFIX => '.fetch.tuxmachines.img.tmp',
                           UNLINK => 1 );
my $tmpfile = &fetch_image($canonical, $tmp);
# no extension in the URL: go by the image data instead
if (!$dext) {
    ($dext) = &verify_format($tmp);
}
my ($file, $dup) = &deduplicate($dbfile, $tmpfile, $documentroot,
                                $dpath, $dfile, $dext);
unlink($tmpfile)
    or die("Could not remove '$tmpfile' from upload directory\n");
# Build the XHTML for one linked image.  Percent signs in the paths are
# escaped once more for use in attributes.  The width and height attributes
# are left out when the size is not known, rather than printed empty.
my $markup = sub {
    my ($href, $src, $w, $h) = (@_);
    $href =~ s/%/%25/g;
    $src =~ s/%/%25/g;
    my $size = '';
    if ($w && $h) {
        $size = qq( width="$w" height="$h");
    }
    return(qq(<a href="$href"><img src="$src"$size alt="" /></a>));
};
if (!$dup) {
    # the main file is new, make a new thumbnail for it
    my ($thumbnail, $width, $height) =
        &make_thumbnail($dbfile, $documentroot, $file);
    # without a thumbnail the image itself is shown
    my $src = $thumbnail ? $thumbnail : $file;
    print $markup->($file, $src, $width, $height), "\n\n";
} else {
    # the main file already exists
    my ($f, $d, $s) = fileparse($file, qr/\.[^.*]*$/);
    my $thumb = qq($d$f.thumbnail$s);
    if (-f $documentroot.$thumb) {
        if ($VERBOSE) {
            print "DUP Thumb $thumb\n";
        }
        # read width and height from the existing thumbnail file
        my $image = Image::Magick->new;
        if (! open(IMAGE, '<', $documentroot.$thumb)) {
            print "Error: could not open '$documentroot$thumb': $!\n";
            exit(1);
        }
        my $err = $image->Read(file=>\*IMAGE);
        close(IMAGE);
        if ($err) {
            print "Error: $err\n";
            exit(1);
        }
        my ($width,$height) = $image->Get('width','height');
        # debugging only; it must not end up in the markup
        if ($VERBOSE > 1) {
            print "Thumbnail size ($width,$height)\n";
        }
        print $markup->($file, $thumb, $width, $height), "\n";
    } else {
        if ($VERBOSE) {
            print "DUP no Thumb\n";
        }
        # try for a thumbnail; an image that gets none is shown as it is
        my ($thumbnail, $width, $height) =
            &make_thumbnail($dbfile, $documentroot, $file);
        my $src = $thumbnail ? $thumbnail : $file;
        print $markup->($file, $src, $width, $height), "\n";
    }
}
exit(0);
# Print the help text on the currently selected output handle and return 1.
# Only the base name of the script path is shown; the three directory
# arguments are reported as the script's configuration.
sub usage {
    my ($program, $docroot, $srvroot, $imgdir) = (@_);
    $program = basename($program);
    print <<"EOUSAGE";
Usage:
$program [option] url
Run this script with the URL to an image file as the first
argument and it will create a thumbnail in the destination
directory, move the original there too, and then display the
relevant HTML markup to the image and it's thumbnail.
If the image is less than 250 pixels on its largest axis, then
no thumbnail will be generated and only the original will be used.
DocumentRoot:
$docroot
ServerRoot:
$srvroot
Image Directory:
$docroot$imgdir
The aspect ratio will be preserved. Thumbnails for images in
landscape mode will have a maximum width of 250 and those in
portrait mode will have a maximum height of 250.
-d, --database initialize database if missing
-v increase debugging verbosity
-h this help text
EOUSAGE
    return(1);
}
# Append the current year and month to a base path, so that "/i" becomes
# "/i/YYYY/MM".
sub dpath {
    my ( $dpath ) = (@_);
    # the true argument asks Today() for the date in GMT
    my ($year, $month) = Today(1);
    return(sprintf("%s/%04d/%02d", $dpath, $year, $month));
}
# Take an image URL apart and put it back together from its parsed pieces.
#
# Only http and https URLs are accepted; anything else is reported with a
# warning and ends the program with status 1.  Paths containing a semicolon
# or a control character are refused with die().
#
# Returns the rebuilt URL "scheme://host:port/path", the name of the file at
# the end of the path up to its last dot, and the lower-cased text after
# that dot.  $documentroot is accepted but not used.
sub cleaned_url {
    my ($input, $documentroot) = (@_);
    my $uri = URI->new($input);
    my $scheme = $uri->scheme || 0;
    if ($scheme ne 'https' && $scheme ne 'http') {
        warn("Unconfigured protocol: $scheme\n");
        exit(1);
    }
    my $host = $uri->host || 0;
    my $path = defined($uri->path) ? $uri->path : 0;
    my $port = $uri->port;
    if ($path =~ m|\;.*$|
        || $path =~ m|[\000-\037]|) {
        die("Bad URL path\n");
    }
    # the part of the path that holds no slash and runs up to the end
    my ($file) = ($path =~ m#([^/\;]*)(\;|$)#);
    my $canonical = "$scheme://$host:$port$path";
    if ($VERBOSE > 1) {
        print qq(URI= $uri\n);
        print qq( $scheme\n $host \t$port \t$path\n);
        print qq( $canonical\n);
        print qq( File: $file\n);
    }
    # split at the last dot; there may be none
    my ($dfile, $dext) = ($file =~ m/([^\.]*)\.?([^\.]*)$/);
    $dext = lc($dext);
    if ($VERBOSE > 1) {
        print qq( F: $file\n);
        print qq( P: $dpath\n);
        print qq( N: $dfile\t$dext\n);
    }
    return($canonical, $dfile, $dext);
}
# Download $canonical with wget into the file named by the File::Temp
# object $tmp and return that file name.  Dies when wget does not exit
# with status 0.
sub fetch_image {
    my ($canonical, $tmp) = (@_);
    # use a temp file while checking duplicate and such
    my $target = $tmp->filename;
    # clear the way for wget
    if (-f $target) {
        unlink($target);
    }
    # wget does not acknowledge either self-signed or Let's Encrypt
    my @wget = ('wget', '--no-check-certificate', '-q',
                '-O', $target, "$canonical");
    if (system(@wget) != 0) {
        die("system '@wget' failed: $?\n");
    }
    return($target);
}
# Work out a file extension from the image data itself.
#
# $tmp is the file to inspect; it may be a File::Temp object, which is used
# as a file name.  Returns the lower-cased format name reported by
# Image::Magick when it is one of jpeg, jpg, png, gif, avif or svg, and the
# generic 'image' in every other case, including when the file cannot be
# opened or no format is reported.
sub verify_format {
    my ($tmp) = (@_);
    my $dext = 'image';
    if (! open(IMAGE, '<', $tmp)) {
        warn("Could not open '$tmp' to check its format: $!\n");
        return($dext);
    }
    my $image = Image::Magick->new;
    $image->Read(file=>\*IMAGE);
    close(IMAGE);
    # Identify() prints its report, so catch it
    my ($id) = capture_stdout{ $image->Identify() };
    my ($format) = ($id =~ m/Format:\s+(\w+)/);
    # no Format line in the report: settle for the generic extension
    if (!defined($format)) {
        return($dext);
    }
    $format = lc($format);
    if ($VERBOSE > 1) {
        print " O: ",$format,"\n";
    }
    my %known = map { $_ => 1 } qw(jpeg jpg png gif avif svg);
    return($known{$format} ? $format : $dext);
}
# File a fetched image under the document root unless an image with the
# same SHA-256 checksum is already on record.
#
#   $dbfile       - SQLite database holding the images table
#   $tmpfile      - the fetched image
#   $documentroot - file system path of the web root
#   $dpath        - destination directory, relative to the web root
#   $dfile, $dext - base name and extension for the new file
#
# Returns ($file, $dup): the image's path relative to the web root and 1
# when it was already on record, 0 when it has just been added.  A new file
# gets a counter inserted before its extension when the name is taken.
# Dies when the directory or the copy cannot be made.
sub deduplicate {
    my ($dbfile, $tmpfile, $documentroot, $dpath, $dfile, $dext) = (@_);
    # primitive de-duplication by image SHA256 checksum
    my $dup = 0;
    my $sha = Digest::SHA->new('sha256_hex');
    $sha->addfile($tmpfile);
    my $fingerprint = $sha->hexdigest;
    if ($VERBOSE) {
        print qq( SHA256: $fingerprint\n);
    }
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 0, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my $query = qq(SELECT * FROM images WHERE sha256=?);
    my $sth = $dbh->prepare($query)
        or die("prepare statement failed: ".$dbh->errstr()."\n");
    $sth->execute($fingerprint)
        or die("execute statement failed: ".$dbh->errstr()."\n");
    my $file = '';
    if (my $data = $sth->fetchrow_hashref) {
        $file = $data->{'image'};
        $sth->finish;
        $dup = 1;
    } else {
        my $destdir = $documentroot.$dpath;
        if (! -e $destdir) {
            make_path($destdir,{mode=>0775})
                or die("Could not create path '$destdir' : $!\n");
            print "Created directory '$destdir'\n" if ($VERBOSE);
        } elsif (! -d $destdir) {
            die("'$destdir' exists but is not a directory.\n");
        } elsif (! -w $destdir) {
            die("Directory '$destdir' is not writable.\n");
        }
        my $newfile = $dpath.'/'.$dfile.'.'.$dext;
        my $absfile = $documentroot.$newfile;
        # pick the first free name by counting up
        my $count = 1;
        while (-e $absfile) {
            $newfile = "$dpath/$dfile.$count.$dext";
            $absfile = $documentroot.$newfile;
            $count++;
        }
        my $epoch = time();
        $query = qq(INSERT INTO images (sha256, epoch, image)
VALUES (?,?,?));
        $sth=$dbh->prepare($query)
            or die("prepare statement failed: ".$dbh->errstr()."\n");
        $sth->execute($fingerprint, $epoch, $newfile)
            or die("execute statement failed: ".$dbh->errstr()."\n");
        if ($VERBOSE > 1) {
            print qq(Query = $query\n);
            print qq(FEN= $fingerprint, $epoch, $newfile\n);
        }
        copy($tmpfile, $absfile)
            or die("Could not relocate from '$tmpfile' to '$absfile'\n");
        # double check group write for the shared file; the mode has to go
        # on the file under the document root, not on the relative path
        my $mode = 0664;
        chmod($mode, $absfile)
            or warn("Could not set permissions on '$absfile': $!\n");
        $sth->finish;
        # the record only becomes permanent once the file is in place
        $dbh->commit;
        $file = $newfile;
    }
    $dbh->disconnect;
    return($file, $dup);
}
# Make a thumbnail for an image whose larger side exceeds 250 pixels.
#
#   $dbfile         - accepted but not used here
#   $documentroot   - file system path of the web root
#   $original_image - the image's path relative to the web root
#
# The thumbnail is written next to the original as
# "name.thumbnail.extension", scaled so that its larger side is 250 pixels.
# An svg is not scaled; its thumbnail is a hard link to the original.
#
# Returns ($thumbnail, $width, $height): the thumbnail's path relative to
# the web root with the thumbnail's size, or 0 with the size of the
# original when the image is small enough to need no thumbnail.  Prints an
# error and exits with status 1 when the image cannot be read.
sub make_thumbnail {
    my ($dbfile,$documentroot, $original_image) = (@_);
    my ($destfile, $destpath, $destext) =
        fileparse($original_image, qr/\.[^.*]*$/);
    $destext =~ s/^\.//;
    my $thumbnail = $destpath.$destfile.'.thumbnail.'.$destext;
    my $image = Image::Magick->new;
    if (! open(IMAGE, '<', $documentroot.$original_image)) {
        print "Error: could not open '$documentroot$original_image': $!\n";
        exit(1);
    }
    my $err = $image->Read(file=>\*IMAGE);
    close(IMAGE);
    if ($err) {
        print "Error: $err\n";
        exit(1);
    }
    my ($width,$height) = $image->Get('width','height');
    # small enough as it is: report the real size so that callers can
    # still write complete markup
    if ($width <= 250 && $height <= 250) {
        return(0, $width, $height);
    }
    # keep the aspect ratio, with the larger side at 250
    my ($twidth, $theight);
    if ($width > $height) {
        $theight = int($height * (250/$width));
        $twidth = 250;
    } else {
        $twidth = int($width * (250/$height));
        $theight = 250;
    }
    if ($destext ne 'svg') {
        $image->Resize(width=>$twidth, height=>$theight);
        $image->Write($documentroot.$thumbnail);
    } else {
        if (link($documentroot.$original_image,
                 $documentroot.$thumbnail)) {
            if ($VERBOSE) {
                print "Created hard link for thumbnail\n";
            }
        } else {
            die("Could not hard link for thumbnail: "
                ."'$documentroot$original_image' -> "
                ."'$documentroot$thumbnail': $!\n");
        }
    }
    # double-check the group write permissions for this shared file
    my $mode = 0664;
    chmod($mode, $documentroot.$thumbnail);
    return($thumbnail, $twidth, $theight);
}
# Undo the filing of an image that turned out to be unusable: when
# $absfilepath is a regular file, delete the images record with the file's
# SHA-256 checksum from the database and remove the file.  Never returns;
# it always ends in die().
sub clean_up {
    my ($dbfile,$absfilepath) = (@_);
    my $epitaph = "Could not process image. File and db entry removed.\n";
    die($epitaph) if (! -f $absfilepath);

    # the record is found by the checksum of the file's contents
    my $sha = Digest::SHA->new('sha256_hex');
    $sha->addfile($absfilepath);
    my $fingerprint = $sha->hexdigest;
    die("Could not fingerprint the original file: $absfilepath\n")
        if (!$fingerprint);

    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 0, RaiseError => 1 })
        or die("Could not open database '$dbfile': $!\n");
    my $sth = $dbh->prepare(qq(DELETE FROM images WHERE sha256=?))
        or die("prepare statement failed: $dbh->errstr()\n");
    $sth->execute($fingerprint)
        or die("execute statement failed: $dbh->errstr()\n");
    $sth->finish;
    $dbh->commit;
    $dbh->disconnect;

    unlink($absfilepath);
    die($epitaph);
}
# Create the image database: make its directory if need be, then pipe the
# table definition and the index definition through the sqlite3 command
# line program, echoing each command first.  Returns 1; dies when the
# directory is unusable or when either command fails.
sub prepare_database {
    my ($dbfile) = (@_);
    my ($name, $dir, $ext) = fileparse($dbfile, qr/\.[^.*]*$/);
    $ext =~ s/^\.//;
    if (! -e $dir) {
        make_path($dir,{mode=>0775})
            or die("Could not create path '$dir' : $!\n");
        print "Created directory '$dir'\n" if ($VERBOSE);
    } elsif (! -d $dir) {
        die("'$dir' exists but is not a directory.\n");
    } elsif (! -w $dir) {
        die("Directory '$dir' is not writable.\n");
    }
    my $db = qq($dir/$name.$ext);
    # each step: the SQL to run and the start of the complaint if it fails
    my @steps = (
        [ qq(CREATE TABLE IF NOT EXISTS
images (sha256 varchar(64) unique not null,
epoch integer not null,
image varchar(256) not null)),
          "Could not create database '$db': " ],
        [ qq(CREATE UNIQUE INDEX fingerprint on images (sha256)),
          "Could not create index: " ],
    );
    for my $step (@steps) {
        my ($schema, $complaint) = @{$step};
        my $command = join(' ', 'echo', "'$schema'", '|', 'sqlite3', $db);
        print $command,"\n";
        if (system($command) != 0) {
            die($complaint."$?\n");
        }
    }
    print "\n";
    return(1);
}
./tm-ssh-wrapper.pl
#!/usr/bin/perl -T
use URI;
use English;
use strict;
use warnings;
# Main program of tm-ssh-wrapper.pl: look at the command the SSH client
# asked for and run the one matching script, and nothing else.
#
#   new | add     run add-and-refresh-from-db.sh
#   update URL    run update-and-refresh-from-db.sh with the URL, provided
#                 it is an http(s) URL on one of this site's hosts and
#                 names an existing file under the document root
#
# Exits with status 1 when no command was given and dies when a check
# fails.  Any other command is ignored and ends with status 0.

# Make %ENV safer
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
# assign PATH explicitly
$ENV{PATH} = "/bin:/usr/bin:/usr/local/bin";
# print $ENV{'SSH_ORIGINAL_COMMAND'},"\n";
my $option = $ENV{'SSH_ORIGINAL_COMMAND'};
if (!$option) {
    exit(1);
}
if ($option =~ m/^new$/i
    || $option =~ m/^add$/i ) {
    exec("/usr/local/bin/add-and-refresh-from-db.sh")
        or die("Could not run add-and-refresh-from-db.sh: $!\n");
} elsif ($option =~ m/^update\s+/) {
    # the last word of the command is the URL
    my ($url) = ($option =~ m/\s+(\S+)$/);
    my $uri = URI->new($url)
        or die();
    my $scheme = $uri->scheme
        or die();
    my $host = $uri->host
        or die();
    my $path = $uri->path
        or die();
    if ($scheme ne 'http'
        && $scheme ne 'https' ){
        die;
    }
    if ($host ne 'tuxmachines.org'
        && $host ne 'news.tuxmachines.org') {
        die;
    }
    # a ".." segment would let the file test below look at files outside
    # of the document root
    if ($path =~ m{(?:\A|/)\.\.(?:/|\z)}) {
        die;
    }
    my $documentroot = '/var/www/tuxmachines.org/htdocs';
    if (! -f "$documentroot/$path") {
        die;
    }
    # hand over a URL rebuilt from the checked parts only
    my $clean = "$scheme://$host$path";
    exec('/usr/local/bin/update-and-refresh-from-db.sh',$clean)
        or die("Could not run update-and-refresh-from-db.sh: $!\n");
}
exit(0);
./tm-update-entry-sql.pl
#!/usr/bin/perl
# 2022-07-17
use utf8;
use Getopt::Long;
use URI;
use DBI qw(:sql_types :utils);
use Date::Calc qw(Today_and_Now);
use File::Temp qw(tempfile);
use HTML::TreeBuilder::XPath;
use HTML::FormatText;
use Capture::Tiny qw(capture capture_stdout);
use Term::ANSIColor;
use English;
use strict;
use warnings;
use lib "/usr/local/lib/perl5/";
use TuxMachines::ReadOn qw(ReadOn);
# Main program of tm-update-entry-sql.pl: find a record by record number or
# by URL, then either edit it interactively and write it back, or delete it.
my $url = "";
my $recno = 0;
my $delete = 0;
my $help = 0;
our $force = 0;
our $VERBOSE = 0;
GetOptions ("url=s" => \$url,
            "recno=i" => \$recno,
            "delete" => \$delete,
            "force" => \$force,
            "help" => \$help,
            "verbose+" => \$VERBOSE,
    )
    or die("Error in runtime options\n");
my ($script) = ($0 =~ m/([^\/]+)$/);
# honour --help before anything is looked up or changed; usage() exits
if ($help) {
    &usage($script);
}
my %metadata = ();
my $body = '';
my $rawtext = '';
my $dbfile = "/var/www/tuxmachines.org/db/tm-static-site-generator.sqlite3";
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                       { AutoCommit => 0, RaiseError => 1 })
    or die("Could not open database '$dbfile': $!\n");
if (!$delete) {
    # edit: the record number wins when both it and a URL are given
    if ($recno) {
        %metadata = &get_metadata($dbh, $recno);
    } elsif ($url) {
        $recno = &get_recordnumber_from_url($dbh, $url)
            or die("Record not found for '$url'\n");
        if ($recno) {
            %metadata = &get_metadata($dbh, $recno);
        }
    } else {
        &usage($script);
    }
    $body = &get_body($dbh, $recno);
    ($body, $rawtext, %metadata) = &edit_record($body, %metadata);
    if (&write_database($dbh, $recno, $body, $rawtext, %metadata)) {
        print "Record Modified Successfully\n";
        exit(0);
    } else {
        exit(1);
    }
} else {
    # delete
    if (!$recno && $url) {
        $recno = &get_recordnumber_from_url($dbh, $url)
            or die("Record not found for '$url'\n");
    } elsif (!$recno) {
        &usage($script);
    }
    if (&delete_record($dbh, $recno)) {
        print "Record $recno deleted\n";
    }
}
my $rc = $dbh->disconnect or warn $dbh->errstr;
exit(0);
# Print the help text on standard output and exit with status 0.
# $program is the name shown in the synopsis.
sub usage {
    my ($program) = (@_);
    print <<"EOUSAGE";
USAGE
$program [dfhv] --recno n | --url url
-r, --recno the record number in the SQL database
-u, --url the http(s) URL for the post in question
-d, --delete remove the record designated by record number or URL
-f, --force don't stop for any errors during, for deletion only
-v show debugging info
-h show this message
Either the record number or the URL is necessary, but not both. If both
are supplied, only the record number will be used. If the URL is used,
it will be parse for the date and the slug and those used to figure out
which record to work on.
EOUSAGE
    exit(0);
}
# Look up the record number that belongs to a post's URL.
#
#   $dbh - open database handle
#   $url - URL whose path is "/n/YYYY/MM/DD/slug.shtml" or
#          "/n/YYYY/MM/DD/slug.ballast.shtml", ballast being digits
#
# Returns the record number, or 0 when the path has neither form or when no
# record matches.  The pieces taken from the URL reach the query as bound
# values only, never as part of the SQL text.
sub get_recordnumber_from_url {
    my ($dbh, $url) = (@_);
    my $u = URI->new($url)
        or die("Bad URL: $url\n");
    my $path = $u->path;
    if ($VERBOSE) {
        # scheme and host are shown for debugging only; a URL without a
        # scheme has neither
        my $scheme = $u->scheme;
        $scheme = '' if (!defined($scheme));
        my $host = $u->can('host') ? $u->host : '';
        $host = '' if (!defined($host));
        print "S=$scheme\n";
        print "H=$host\n";
        print "P=$path\n";
    }
    my $query;
    my @values;
    if ( my ($year, $month, $day, $slug, $ballast) =
         ( $path =~ m|^/n/([0-9]{4})/([0-9]{2})/([0-9]{2})/
                       (.*)\.([0-9]+)\.shtml$|x ) ) {
        $query = qq(SELECT recno FROM keys
WHERE date=?
AND slug=? AND ballast=?);
        @values = ($year.$month.$day, $slug, $ballast);
    } elsif ( my ($year, $month, $day, $slug) =
              ( $path =~ m|^/n/([0-9]{4})/([0-9]{2})/([0-9]{2})/
                            (.*)\.shtml$|x ) ) {
        $query = qq(SELECT recno FROM keys
WHERE date=?
AND slug=?);
        @values = ($year.$month.$day, $slug);
    } else {
        # not the address of a post
        return(0);
    }
    my $sth = $dbh->prepare($query);
    $sth->execute(@values);
    my $row = $sth->fetch;
    my $recno = ($row && $row->[0]) ? $row->[0] : 0;
    $sth->finish;
    return($recno);
}
# Read all metadata of one record.
#
#   $dbh   - open database handle
#   $recno - record number, passed to the query as a bound value
#
# Returns a hash that maps each term to a reference to the list of its
# values, in the order the rows come back.  The hash is empty when the
# record has no metadata.
sub get_metadata {
    my ($dbh, $recno) =(@_);
    my %metadata = ();
    my $query = qq(SELECT * FROM metadata WHERE recno=?);
    my $sth = $dbh->prepare($query);
    $sth->execute($recno);
    while (my $row = $sth->fetchrow_hashref) {
        my $term = $row->{'term'};
        my $value = $row->{'value'};
        # a term may occur more than once, so collect its values
        push(@{$metadata{$term}}, $value);
    }
    $sth->finish;
    return(%metadata);
}
# Read the body text of one record.
#
#   $dbh   - open database handle
#   $recno - record number, passed to the query as a bound value
#
# Returns the body, or 0 when there is no row for the record or when its
# body is empty.
sub get_body {
    my ($dbh, $recno) = (@_);
    my $query = qq(SELECT body FROM body WHERE recno=?);
    my $sth = $dbh->prepare($query);
    $sth->execute($recno);
    my $row = $sth->fetchrow_hashref;
    my $body = ($row && $row->{'body'}) ? $row->{'body'} : 0;
    $sth->finish;
    return($body);
}
# Interactively edit one record.
#
# First every metadata term is offered for editing on the terminal, then
# the body is opened in nano.  The body edit is repeated until tidy
# accepts the markup and both image checks (no hotlinked images, no
# missing or empty ALT text) pass.
#
# Parameters:
#   $body     - current XHTML body of the record
#   %metadata - term => array ref of values
#
# Returns the list ($body, $rawtext, %metadata): the edited body after
# ReadOn() and hair-space replacement, a plain-text rendering made by
# HTML::FormatText, and the edited metadata.
#
# Dies when nano cannot be run, when a temporary file cannot be opened
# or parsed, or when STDIN ends while a confirmation is awaited.
sub edit_record {
    my ($body, %metadata) = (@_);

    # --- metadata: prompt for each term until the user confirms ---
    my $confirmed = 0;
    while (!$confirmed) {
        for my $k (sort keys %metadata) {
            if ($k =~ m/^dc\.date\.created/) {
                # the creation date is shown but cannot be edited
                print "$k [",join(';', @{$metadata{$k}}),"] \n";
            } elsif ($k =~ m/^dc\.date\.modified/) {
                # the modification date is always set to the current time
                my ($year,$month,$day, $hour,$min,$sec) = Today_and_Now(1);
                my $date = sprintf("%04d-%02d-%02dT%02d:%02d",
                                   $year,$month,$day,$hour,$min);
                $metadata{$k}[0] = $date;
                print "$k [",join(';', @{$metadata{$k}}),"] \n";
            } else {
                print "$k [",join(';', @{$metadata{$k}}),"] ";
                my $v = <STDIN>;
                $v = '' unless defined($v);   # end of input: keep the value
                chomp($v);
                if ($v ne '') {
                    # several values are entered separated by semicolons,
                    # the same way they are displayed in the prompt
                    @{$metadata{$k}} = split(/;/, $v);
                }
            }
        }
        print "\nOK? [y / N] ";
        my $answer = <STDIN>;
        # without this check a closed STDIN would loop here forever
        defined($answer)
            or die("Unexpected end of input\n");
        chomp $answer;
        if ($answer eq 'y' or $answer eq 'Y') {
            $confirmed = 1;
        }
    }

    # --- temporary files: one for the editor, one for the validator ---
    my $editor = File::Temp->new( TEMPLATE => 'temp.XXXXX',
                                  DIR => '/tmp',
                                  SUFFIX => '.tm.body1.tmp',
                                  UNLINK => 1 );
    my $validator = File::Temp->new( TEMPLATE => 'temp.XXXXX',
                                     DIR => '/tmp',
                                     SUFFIX => '.tm.body2.tmp',
                                     UNLINK => 1 );
    my $tmpfile = $editor->filename;
    -f $tmpfile && unlink($tmpfile); # clear the way for nano
    my $vfile = $validator->filename;
    -f $vfile && unlink($vfile); # clear the way for nano
    open (my $out, ">", $tmpfile)
        or die("Could not open '$tmpfile' for writing: $!\n");
    print $out $body;
    close($out);

    # --- body: edit, tidy, validate; repeat until everything passes ---
    my @cmd = ();
    my $done = 0;
    while (!$done) {
        @cmd = ('/usr/bin/nano', '--tabstospaces', $tmpfile);
        system(@cmd) == 0
            or die("editing '@cmd' failed: $?\n");

        open(my $in, "<", $tmpfile)
            or die("Could not open '$tmpfile' for reading\n");
        my $lines = "";
        while (my $line = <$in>) {
            # escape free-standing ampersands so the markup stays valid
            $line =~ s| \& | &amp; |g;
            $lines .= $line;
        }
        close ($in);

        open(my $ov, ">", $vfile)
            or die("Could not copy to '$vfile'\n");
        # a line that does not start with a tag and is followed by a blank
        # line is taken as plain text: wrap the paragraphs in <p> elements
        if ($lines =~ m/^(?!<[^>]+>).*(?=\n\n)/m) {
            $lines =~ s|^|<p>|;
            $lines =~ s|\n\n+|</p>\n<p>\n|gm;
        }
        print $ov $lines;
        close ($ov);

        # first run rewrites the file in place (-m) as XHTML; its exit
        # status is not examined
        @cmd = ('/usr/bin/tidy', '-m', '-q',
                '--preserve-entities', 'yes', '-utf8', '-asxml', $vfile);
        my ($stdout, $stderr, $result) = capture { system(@cmd) };
        # second run decides whether the result is acceptable
        @cmd = ('/usr/bin/tidy', '-q', '--show-info', 'no',
                '--preserve-entities', 'yes', '-utf8', '-xml', $vfile);
        ($stdout, $result) = capture_stdout { system(@cmd) };

        if ($result) {
            print STDERR "HTML validation failed\n";
            print STDERR "press RETURN to continue editing";
            my $i = <STDIN>;
            next;
        }

        my $xhtml = HTML::TreeBuilder::XPath->new;
        $xhtml->implicit_tags(1);
        $xhtml->parse_file($vfile)
            or die("Could not parse '$vfile' : $!\n");
        my @hotlinked =
            $xhtml->findnodes('//img[starts-with(@src,"http")]');
        my @without_alt =
            $xhtml->findnodes('//img[not(@alt) or @alt[not(string())]]');
        my $hotlinks = scalar(@hotlinked);
        my $missing_alt = scalar(@without_alt);
        $xhtml->delete;

        if ($hotlinks) {
            print STDERR "Failure: image hotlinking present. ";
            print STDERR "Remove it to proceed.\n";
            print STDERR "press RETURN";
            my $i = <STDIN>;
        }
        if ($missing_alt) {
            print STDERR color('bold white');
            print STDERR "Failure: missing or empty ALT attribute in IMG.";
            print STDERR " Add it to proceed.\n";
            print STDERR "press RETURN";
            print STDERR color('reset');
            my $i = <STDIN>;
        }
        # both checks have to pass; passing one must not hide the other
        if (!$hotlinks && !$missing_alt) {
            $done = 1;
        }
    }

    # --- read the validated file back and extract body and raw text ---
    my $xhtml = HTML::TreeBuilder::XPath->new;
    $xhtml->implicit_tags(1);
    $xhtml->no_expand_entities(1);
    open (my $xhtmlfile, "<", $vfile)
        or die("Could not open '$vfile' for reading: $!\n");
    $xhtml->parse_file($xhtmlfile)
        or die("Could not parse content from '$vfile' : $!\n");
    $body = '';
    my $rawtext = '';
    my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 78);
    for my $node ($xhtml->findnodes('//body')) {
        $rawtext = $formatter->format($node);
        $body = $node->as_HTML('', ' ', {});
    }
    $xhtml->delete;
    close($xhtmlfile);
    close($editor);
    close($validator);

    # modify the ReadOn link if needed
    $body = ReadOn($body);
    # turn 'hair space' into a normal space
    $body =~ s/\x{200a}/ /gm;
    return($body, $rawtext, %metadata);
}
# Store an edited record: replace its metadata rows, its body and its
# raw text, then commit.
#
# Parameters:
#   $dbh      - database handle offering DBI-style prepare(), commit()
#               and rollback()
#   $recno    - record number being written
#   $body     - new body
#   $rawtext  - plain-text rendering; the dc.title values are put in
#               front of it before it is stored
#   %metadata - term => array ref of values
#
# Returns 1 after a successful commit.
#
# On a failed DELETE/UPDATE the work is rolled back and the program
# exits with status 1; on a failed metadata INSERT it is rolled back and
# the sub dies.  Failures are only noticed when execute() throws.
sub write_database {
    my ($dbh, $recno, $body, $rawtext, %metadata) = (@_);

    # a record without dc.title contributes no title words
    my @titles = @{ $metadata{'dc.title'} || [] };
    $rawtext = join(' ', @titles).' '.$rawtext;

    # each step: description for messages, SQL text, bind values
    my @steps = (
        [ 'deletion',
          'DELETE FROM metadata WHERE recno=?', $recno ],
        [ 'body update',
          'UPDATE body SET body=? WHERE recno=?', $body, $recno ],
        [ 'rawtext update',
          'UPDATE rawtext SET fulltext=? WHERE recno=?', $rawtext, $recno ],
    );
    for my $step (@steps) {
        my ($what, $sql, @bind) = @{$step};
        my $sth = $dbh->prepare($sql)
            or die("Could not prepare $what\n");
        eval {
            $sth->execute(@bind);
        };
        if ($@) {
            warn("Could not execute $what: $@");
            $sth->finish;
            $dbh->rollback;
            exit(1); # error
        }
        $sth->finish;
    }

    # the old metadata rows are gone; write the current set
    my $sth = $dbh->prepare('INSERT INTO metadata (recno, term, value)
                             VALUES (?, ?, ?)')
        or die("Could not prepare metadata insertion\n");
    for my $k (sort keys %metadata) {
        for my $v (@{$metadata{$k}}) {
            eval {
                $sth->execute($recno, $k, $v);
            };
            if ($@) {
                my $reason = $@;   # keep it: later calls may reset $@
                $sth->finish;
                $dbh->rollback;
                die("Could not reinsert metadata: $reason\n");
            }
        }
    }
    $sth->finish;
    $dbh->commit;
    return(1);
}
# Delete one record: remove its generated .shtml file and then its rows
# in the metadata, rawtext, keys and body tables.
#
# Parameters:
#   $dbh   - database handle offering DBI-style prepare(), commit() and
#            rollback()
#   $recno - record number to delete
#
# Reads the file-level flags $VERBOSE and $force.  With $force set, a
# missing or un-unlinkable file and failing DELETE statements are
# reported but do not stop the run; without it the program exits with
# status 1 at the first such problem.
#
# Returns 1 after the commit.  Dies when the record is not in the keys
# table or a statement cannot be prepared.
sub delete_record {
    my ($dbh, $recno) = (@_);

    # the date, slug and ballast tell where the generated file lives
    my $sth = $dbh->prepare('SELECT date,slug,ballast FROM keys WHERE recno=?')
        or die("Could not prepare fetch date and slug from keys table\n");
    eval {
        $sth->execute($recno);
    };
    if($@) {
        warn("Could not fetch date and slug from keys table\n");
        $sth->finish;
        $dbh->rollback;
        exit(1); # error
    }
    my $row = $sth->fetchrow_hashref;
    if ($sth->err) {
        print $sth->errstr,"\n";
    }
    # test the fetched row itself; a row count is not dependable for SELECT
    if (!$row) {
        $sth->finish;
        die("No record \x{23}$recno found\n");
    }
    $sth->finish;
    my $date = $row->{'date'} || 0;
    my $slug = $row->{'slug'} || 0;
    my $ballast = $row->{'ballast'} || 0;
    if (!$date || !$slug) {
        die("No working info from record number. Does it exist?\n");
    }

    # YYYYMMDD becomes the YYYY/MM/DD directory part of the path
    my $file = $date;
    $file =~ s|^([0-9]{4})([0-9]{2})([0-9]{2})$|$1/$2/$3|;
    if ($ballast) {
        $file = "/var/www/tuxmachines.org/htdocs/n/$file/$slug.$ballast.shtml";
    } else {
        $file = "/var/www/tuxmachines.org/htdocs/n/$file/$slug.shtml";
    }

    my $ferror = 0;
    if (!-f $file) {
        $ferror++;
        warn("File '$file' does not exist\n");
        if (!$force) {
            warn(" --force not enabled. Quitting.\n");
            exit(1);
        }
    } elsif (!-w $file) {
        # not overridable with --force
        $ferror++;
        warn("File '$file' is not writable\n");
        exit(1);
    } elsif (unlink($file)) {
        if ($VERBOSE) {
            print "File '$file' deleted. Directory remains.\n";
        }
    } else {
        $ferror++;
        warn("File '$file' was not unlinked\n");
        if (!$force) {
            warn(" --force not enabled. Quitting.\n");
            exit(1);
        }
    }

    # Each entry: table name, and the wording used when prepare() fails.
    # keys comes before body so that the row referring to body.recno is
    # gone before the body row it refers to is removed.
    my @targets = (
        [ 'metadata', 'metadata table' ],
        [ 'rawtext',  'rawtext' ],
        [ 'keys',     'keys' ],
        [ 'body',     'body' ],
    );
    my $derror = 0;
    for my $target (@targets) {
        my ($table, $label) = @{$target};
        $sth = $dbh->prepare("DELETE FROM $table WHERE recno=?")
            or die("Could not prepare deletion from $label\n");
        eval {
            $sth->execute($recno);
        };
        if($@) {
            $derror++;
            warn("Could not delete from $table table\n");
            $sth->finish;
            $dbh->rollback;
            if (!$force) {
                exit(1); # error
            }
        }
    }

    if ($ferror) {
        warn($ferror, " file errors reported\n");
    }
    if ($derror) {
        warn($derror, " database errors reported\n");
    }
    $sth->finish;
    $dbh->commit;
    return(1);
}
# Normalise a date string.
#
# Accepted input and result:
#   YYYYMMDD          -> YYYY-MM-DDT00:00
#   YYYY-MM-DD        -> YYYY-MM-DDT00:00
#   YYYYMMDDTHH:MM    -> returned unchanged
#   YYYY-MM-DDTHH:MM  -> returned unchanged
#
# Returns 0 for anything else, including an undefined argument.
sub iso_8601_date {
    my ($date) = (@_);
    return(0) unless defined($date);

    # date only: add dashes where missing and a midnight time
    return($date)
        if $date =~ s/^([0-9]{4})([0-9]{2})([0-9]{2})$/$1-$2-$3T00:00/;
    return($date)
        if $date =~ s/^([0-9]{4})-([0-9]{2})-([0-9]{2})$/$1-$2-$3T00:00/;

    # date with time, compact or dashed: already complete
    return($date)
        if $date =~ m/^[0-9]{4}[0-9]{2}[0-9]{2}T[0-9]{2}:[0-9]{2}$/;
    return($date)
        if $date =~ m/^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}$/;

    return(0);
}
./update-and-refresh-from-db.sh
#!/bin/sh
# 2022-07-26
#
# Update an existing record and then regenerate the static site.
# Every command-line argument is handed on to tm-update-entry-sql.pl
# after its -u option.

PATH=/usr/local/bin:/usr/bin:/bin

# Map the login name to a display name.
# NOTE(review): $author is set here but not used further down in this
# script - confirm whether tm-update-entry-sql.pl should receive it.
case $USER in
    'tuxmachines')
        author='Tux Machines'
        ;;
    'roy')
        author='Roy Schestowitz'
        ;;
    'rianne')
        author='Rianne Schestowitz'
        ;;
    'marius')
        author='Marius Nestor'
        ;;
    *)
        author=$USER
        ;;
esac

# update a record; "$@" keeps arguments that contain spaces or
# wildcard characters intact
tm-update-entry-sql.pl -u "$@"

# update both the XHTML and Gemtext hierarchies
refresh-site-from-db.sh

exit 0
./tm-static-site-generator.sqlite3.schema
-- Schema of the static site generator's SQLite database.
--
-- body     one row per record: the XHTML body
-- keys     date / slug / ballast of each record, unique as a triple
-- metadata any number of term/value pairs per record
-- rawtext  plain text of each record, mirrored into the FTS5 index
--          "data" by the three rawtext_* triggers below

-- metadata rows refer to the record's row in "body".  (The reference
-- used to name "body_old", a table this schema does not define.)
CREATE TABLE metadata(recno integer, term varchar(25) not null,value varchar(256) not null,constraint fk_recno foreign key (recno) references "body" (recno) on delete cascade);
CREATE TABLE IF NOT EXISTS "body"(recno integer primary key unique, body text not null);
CREATE TABLE IF NOT EXISTS "keys" (recno integer not null unique, written integer default 0 not null, date varchar(8) not null, ballast integer, slug varchar(256) not null, unique (date, slug, ballast), foreign key (recno) references "body" (recno));
CREATE TABLE rawtext(recno integer primary key unique, fulltext text not null);
-- keep the external-content FTS5 index in step with rawtext
CREATE TRIGGER rawtext_insert AFTER INSERT ON rawtext BEGIN
INSERT INTO data(rowid, fulltext) VALUES (new.recno, new.fulltext);
END;
CREATE TRIGGER rawtext_delete AFTER DELETE ON rawtext BEGIN
INSERT INTO data(data, rowid, fulltext) VALUES('delete', old.recno, old.fulltext);
END;
CREATE TRIGGER rawtext_update AFTER UPDATE ON rawtext BEGIN
INSERT INTO data(data, rowid, fulltext) VALUES('delete', old.recno, old.fulltext);
INSERT INTO data(rowid, fulltext) VALUES (new.recno, new.fulltext);
END;
CREATE VIRTUAL TABLE data USING FTS5(fulltext, content=rawtext, content_rowid=recno)
/* data(fulltext) */;
-- the data_* tables below belong to the FTS5 virtual table "data"
CREATE TABLE IF NOT EXISTS 'data_data'(id INTEGER PRIMARY KEY, block BLOB);
CREATE TABLE IF NOT EXISTS 'data_idx'(segid, term, pgno, PRIMARY KEY(segid, term)) WITHOUT ROWID;
CREATE TABLE IF NOT EXISTS 'data_docsize'(id INTEGER PRIMARY KEY, sz BLOB);
CREATE TABLE IF NOT EXISTS 'data_config'(k PRIMARY KEY, v) WITHOUT ROWID;
./.gitignore
n/ feed.xml feeds.html latest-news.html
./tuxmachines.css
@charset "utf-8";
body {
z-index: 10;
font-family: Tahoma, Verdana, Segoe, sans-serif;
margin: 0;
background-color: #444;
text-align: left;
width: 100%;
background-image: url("/Images/F1F1F1E9E9E9CACACAFFFFFF_108.png");
padding-left: 0em;
padding-right: 0em;
}
@keyframes animateDown {
0% {
opacity: 0;
transform: translatey(-15px);
}
100% {
opacity: 1;
transform: translatey(0);
}
}
details {
margin-left: 2em;
margin-right: 2em;
}
details[open] {
animation: animateDown 0.2s linear forwards;
}
h1.recent {
margin-left: 0.5em;
margin-right: 0.5em;
}
div.header {
padding-top: 0;
background-color: #5d7eb1;
background-image: url("/Images/top3.jpg");
background-repeat: no-repeat;
min-height: 128px;
}
div.header img {
float: left;
z-index: -1;
}
div.header h1 {
margin-left: 4em;
text-shadow: -1px 1px #444, 2px 1px #444, 2px 2px #444, -2px 2px #444;
text-transform: uppercase;
}
div.header p {
margin-left: 4em;
font-style: italic;
}
div.latest {
font-size: 80%;
}
div.latest dl {
padding-left: 2em;
}
div.latest dt.updated:after {
content: " ☚ updated today";
font-size: 75%;
font-style: italic;
}
div.latest dd {
font-style: italic;
}
div.latest dl dt a:link {
background-image: linear-gradient(#0000ee, #0000ee);
background-size: 0% 0.1em;
background-position-y: 100%;
background-position-x: 100%;
background-repeat: no-repeat;
transition: background-size 0.2s ease-in-out;
}
div.latest dl dt a:hover {
background-size: 100% 0.1em;
background-position-x: 0%;
}
div.latest dl dt:hover + dd {
font-style: normal;
}
div.latest dl dt:hover + dd:after {
content: " •";
}
h2.latest {
margin-left: 0.5em;
margin-right: 0.5em;
}
div.navigation {
position: relative;
text-align: center;
font-size: 85%;
margin-bottom: 0em;
margin-top: 0em;
padding-top: 2em;
padding-bottom: 0.2em;
padding-left: 2em;
padding-right: 2em;
}
div.navigation ul {
list-style: none;
}
div.navigation ul li {
display: inline;
/* top right bottom left */
margin: 0 0 0 -1em;
border: none;
padding: 0 1em 0 1em;
}
div.navigation ul li a:link {
background-image: linear-gradient(#0000ee, #0000ee),
linear-gradient(#0000ee, #0000ee);
background-size: 0% 0.1em;
background-position-y: 100%;
background-position-x: 0%, 100%;
background-repeat: no-repeat;
transition: background-size 0.2s ease-in-out;
text-decoration: underline;
}
div.navigation ul li a:hover {
background-size: 100% 0.1em;
}
div.navigation2 {
position: inherit;
border-bottom: medium solid #000;
text-align: center;
font-weight: bold;
color: #777;
font-size: 90%;
margin-bottom: 0em;
margin-top: 0em;
margin-left: 0em;
margin-right: 0em;
padding-top: 0em;
padding-bottom: 0.2em;
padding-left: 0em;
padding-right: 0em;
}
div.navigation2 > ul {
list-style-position: outside;
list-style-type: none;
display: flex;
padding-left: 1em;
padding-right: 1em;
margin-left: 0em;
margin-right: 0em;
justify-content: space-between;
}
div.navigation2 > ul > li {
display: inline;
/* top right bottom left */
margin: 0 0 0 0;
border: none;
padding: 0 1em 0 1em;
}
div.navigation2 > ul:before {
content: " 🡄 ";
text-decoration: none;
color: #000;
}
div.navigation2 > ul:after {
content: " 🡆 ";
text-decoration: none;
color: #000;
}
div.navigation2 > ul > li:first-of-type {
margin-right: 1em;
}
div.navigation2 > ul > li:last-of-type {
margin-left: 1em;
}
div.navigation2 > ul > li > a[href] {
color: #000;
}
div.navigation2 > ul > li > a:link {
background-image: linear-gradient(#0000ee, #0000ee),
linear-gradient(#0000ee, #0000ee);
background-size: 0% 0.1em;
background-position-y: 100%;
background-position-x: 0%, 100%;
background-repeat: no-repeat;
transition: background-size 0.2s ease-in-out;
text-decoration: underline;
}
div.navigation2 p a:hover {
background-size: 100% 0.1em;
}
div.error {
border: thin solid #000;
background-image: repeating-linear-gradient(#f44, #a88 10%, #f44 100%);
padding-left: 1em;
padding-right: 1em;
box-shadow: 0.4em 0.4em 0.4em #555;
}
@keyframes slidein {
from {
margin-left: 100%;
width: 300%;
}
to {
margin-left: 0%;
width: 100%;
}
}
div.error h1 {
animation-duration: 1s;
animation-name: slidein;
margin-left: 0%;
}
div.error p.notfound {
font-family: monospace;
animation-duration: 2s;
animation-name: slidein;
}
div.monthly {
background-image: repeating-linear-gradient(#ccc, #ddd 10%, #ccd 80%);
padding-left: 0em;
padding-right: 0em;
box-shadow: 0.4em 0.4em 0.4em #555;
border: thin solid #000;
}
div.monthly > dl > dt {
font-weight: bold;
}
div.monthly dl dd dl dt:hover {
background-color: #ff4;
}
div.monthly dl dd dl dt:hover + dd {
background-color: #ff4;
}
div.post {
background-image: repeating-linear-gradient(#ccc, #ddd 10%, #ccd 80%);
padding-left: 0em;
padding-right: 0em;
box-shadow: 0.4em 0.4em 0.4em #555;
border: thin solid #000;
}
div.post:after {
visibility: hidden;
display: block;
font-size: 0;
content: " ";
clear: left;
height: 0;
}
div.post > h1,h2,h3,h4 {
margin-left: 1em;
margin-right: 1em;
}
div.post > p {
margin-left: 1em;
margin-right: 1em;
}
div.post a:link {
background-image: linear-gradient(#0000ee, #0000ee);
background-size: 0% 0.1em;
background-position-y: 100%;
background-position-x: 50%;
background-repeat: no-repeat;
transition: background-size 0.2s ease-in-out;
text-decoration: underline;
}
div.post span.date {
box-shadow: 0.1em 0.1em 0.1em #555;
text-decoration: auto;
padding-left: 0.5em;
padding-right: 0.5em;
color: #555;
border-radius: 2.4em;
/* text-decoration: -moz-none; */
/* background-image: -moz-elementblack; */
}
div.post a:hover {
background-size: 100% 0.1em;
}
div.post a.readon {
border-radius: 0.3em;
border: thin solid #222;
padding-left: 0.25em;
padding-right: 0.25em;
padding-top: 0.1em;
padding-bottom: 0.1em;
margin-left: 0.2em;
background-image: radial-gradient(ellipse farthest-corner at 30% 20%,
#c6c6c6 20%, #1c1c1c 120%);
background-size: 100%;
box-shadow: 0.2em 0.2em #8f8f8f;
text-align: center;
color: #444;
text-shadow: 0.1em 0.1em #ccc;
text-decoration: none;
font-family: serif;
white-space: nowrap;
position: relative;
}
div.post a.readon:hover {
background-image: radial-gradient(ellipse farthest-corner at 30% 20%,
#767676 20%, #7c7c7c 120%);
color: #000;
}
div.post a.readon[title]:after {
content: "Via: " attr(title);
position: relative;
font-size: 95%;
font-weight: bold;
left: 120%;
color: #222;
}
div.post > ul li {
margin-left: 2em;
border-radius: 1.5em;
background-image: repeating-linear-gradient(#ccc, #ddd 10%, #ccd 20%);
}
div.post blockquote {
quotes: "«" "»" "‘" "’";
font-family: serif;
text-align: left;
margin-top: 0em;
margin-bottom: 0em;
padding: 2em;
}
div.post blockquote + blockquote {
border-radius: 2em;
}
div.post blockquote:before {
color: #444;
content: open-quote;
font-size: 225%;
line-height: 10%;
margin-right: 0.25em;
vertical-align: 0.6em;
text-shadow: 0.1em 0.1em 0.1em #555;
}
div.post blockquote:after {
color: #444;
content: close-quote;
font-size: 225%;
line-height: 10%;
margin-left: 0.25em;
vertical-align: -0.3em;
text-shadow: 0.1em 0.1em 0.1em #555;
}
div.post blockquote:empty:before {
content: none;
}
div.post blockquote:empty:after {
content: none;
}
div.post blockquote p {
padding-bottom: 0em;
}
div.post h1 {
width: 80%;
text-align: left;
font-size: 125%;
}
div.post p.author {
text-align: right;
font-size: 80%;
}
div.post > p.dropcap-first:first-letter {
text-shadow: #888 0.1em 0.1em 0.1em;
float: left;
font-size: 200%;
z-index: 1;
position: absolute;
line-height: 90%;
font-family: Times,Georgia,serif;
}
div.post a.readon {
}
div.post a.readon[title]:after {
content: "Via: " attr(title);
position: absolute;
font-size: 95%;
font-weight: bold;
left: 120%;
color: #222;
}
div.post a.readon[title]:hover:after {
color: #444;
}
div.post img {
float: left;
padding: 0.3em 0.1em 0.3em 0.1em;
box-shadow: 0.4em 0.4em 0.4em #222;
border: medium solid #aaa;
border-radius: 2.5em;
margin-top: -0.5em;
margin-right: 1em;
margin-bottom: 1em;
max-width: 30%;
}
div.post img:hover {
transform: scale(1.15); /* (115% zoom - Note: if the zoom is too large, it will go outside of the viewport) */
/* opacity: 0.3; */
}
div.feedlist {
position: relative;
float: right;
max-width: 20%;
font-size: 75%;
padding: 1em;
border-top: thin solid #000;
border-bottom: thin solid #000;
border-left: thin solid #000;
background-image: url("/Images/F1F1F1E9E9E9CACACAFFFFFF_108.png");
}
/* hide Other Sites feedlist when empty */
div.feedlist:not(:has(div)) {
visibility: hidden;
}
div.feedlist > h1,h2,h3,h4 {
margin-left: 0em;
margin-right: 0em;
}
div.feedlist div {
padding: 0.5em;
border: thin solid #aaa;
border-radius: 1.5em;
margin-bottom: 0.5em;
}
div.feedlist div h3 a:link, div.feedlist > div > h5 > a:link {
background-image: linear-gradient(#0000ee, #0000ee);
background-size: 0% 0.1em;
background-position-y: 100%;
background-position-x: 100%;
background-repeat: no-repeat;
transition: background-size 0.2s ease-in-out;
}
div.feedlist div h3 a:hover, div.feedlist > div > h5 > a:hover {
background-size: 100% 0.1em;
background-position-x: 0%;
}
div.feedlist div h3:hover ~ h5:after {
content: " •";
}
div.feedlist div blockquote {
padding: 0em;
border-bottom: thin solid #000;
}
div.feedlist div blockquote:last-of-type {
border-bottom: none;
}
h1, h2, h3, h4, h5, h6{
font-weight: bold;
font-family: "Liberation Serif", FreeSerif, serif;
}
h1 {
font-size: 200%;
}
h2 {
font-size: 150%;
}
h3 {
font-size: 125%;
}
h4 {
font-size: 115%;
}
h6 {
font-size: 110%;
padding: 1.5em;
border: thin solid #aaa;
border-radius: 1.5em;
}
div.search form input {
border: medium solid #444;
}
div.search form span.period {
display: none;
padding-left: 4em;
}
div.search form span.scope {
display: none;
padding-left: 4em;
}
div.search form input#field01:checked ~ span.period {
display: block;
}
div.search form input#field02:checked ~ span.scope {
display: block;
}
div.footer {
clear: both;
border-top: thin solid #000;
text-align: center;
margin-left: 25%;
margin-top: 2em;
max-width: 50%;
padding: 1em;
font-size: 85%;
}
div.footer ul {
list-style: none;
}
div.footer ul li {
display: inline;
/* top right bottom left */
margin: 0 0 0 -1em;
border: none;
padding: 0 1em 0 1em;
}
iframe {
box-shadow: 15px 15px 15px #444;
float:right;
margin-top: 2em;
margin-bottom: 0.2em;
margin-left: 2em;
margin-right: 2em;
}
./about.shtml
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Tux Machines</title>
<meta name="dc.date.created" content="20220712" />
<meta name="dc.description" content="Do you waddle the waddle" />
<link rel="stylesheet" href="/CSS/tuxmachines.css"
media="screen" type="text/css" />
</head>
<body>
<!--#include virtual="/header.html"-->
<!--#include virtual="/navigation.html"-->
<!--#include virtual="/feeds.html"-->
<div class="post">
<h1>About Tux Machines</h1>
<p><em>Tux Machines</em> is a popular news site focusing on Free/libre and Open Source software, especially GNU/Linux. Founded by Susan (srlinuxx) in 2004, the site aims to share relevant news with its valued community of readers.</p>
<h2>Scope of coverage</h2>
<p>The site places great focus on GNU, Linux, and other intricate systems that utilise these, such as Android, Chrome OS, and Tizen. Of lesser interest are issues that relate <em>purely</em> to development and Free/Open Source software. Games, applications, instructional posts and proprietary software are habitually covered, but they are grouped and posted only periodically. <em>Tux Machines</em> is primarily focused on Linux, but it occasionally also covers BSD/UNIX, Minix, and lesser known operating systems. Some of our news sources include standards, antitrust and so on.</p>
<h2>Contact Details</h2>
<p>See <a href="/contact.shtml">our contacts page</a> for up-to-date details. Communication is also facilitated by <a href="/irc.shtml">our IRC channel</a>.</p>
<h2>Going Ads-free in 2013</h2>
<p>From 2013 onward, <em>Tux Machines</em> has not had ads. Instead it relies on readers' support and is run as a public service. <a href="#top">█</a></p>
<ul>
<li><a href="http://www.tuxmachines.org/node/15555">Help Support TM</a></li>
<li><a href="http://www.tuxmachines.org/node/21601">Wall of Appreciation</a></li>
</ul>
<h2>Going Raw HTML in 2022</h2>
<p>
From mid-2022 onward, the heavy, resource-intensive content management system was dropped and a leaner (perhaps too lean) static site generator used instead. This makes technical administration and speed much better.
</p>
</div>
<h2 class="latest">Other Recent Tux Machines Posts</h2>
<!--#include virtual="/latest-news.html"-->
<!--#include virtual="/footer.html"-->
</body>
</html>
./contact.shtml
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Contact Tux Machines</title>
<meta name="dc.date.created" content="20220712" />
<meta name="dc.description" content="Do you waddle the waddle" />
<link rel="stylesheet" href="/CSS/tuxmachines.css"
media="screen" type="text/css" />
</head>
<body>
<!--#include virtual="/header.html"-->
<!--#include virtual="/navigation.html"-->
<!--#include virtual="/feeds.html"-->
<div class="post">
<h1>Contact Tux Machines</h1>
<p><em>Tux Machines</em> is run by Bytes Media, but it is originally the Web site of Susan Linton, better known as "srlinuxx". </p>
</div>
<h2 class="latest">Other Recent Tux Machines Posts</h2>
<!--#include virtual="/latest-news.html"-->
<!--#include virtual="/footer.html"-->
</body>
</html>
./footer.html
<div class="footer"> <ul> <li><a href="/index.shtml">Home</a></li> <li><a href="/about.shtml">About</a></li> <li><a href="/irc.shtml">IRC</a></li> <!-- <li><a href="/search.html">Search</a></li> --> <li><a href="/feed.xml">Feed</a></li> </ul> </div>
./header.html
<div class="header"> <img src="/Images/tuxmachines.logo.svg" width="128" height="96" alt=""/> <div> <h1>Tux Machines</h1> <p>Do you waddle the waddle?</p> </div> </div>
./index.shtml
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Tux Machines</title>
<meta name="dc.date.created" content="20220712" />
<meta name="dc.description" content="Do you waddle the waddle" />
<link rel="stylesheet" href="/CSS/tuxmachines.css"
media="screen" type="text/css" />
<style type="text/css" media="print">
article a {
font-weight: bolder;
text-decoration: none;
}
article a[href^=http]:after {
content:" <" attr(href) "> ";
}
</style>
<meta name="DC.Creator" content="Tux Machines" />
<link rel="alternate" title="Tux Machines RSS"
href="/feed.xml" type="application/rss+xml" />
</head>
<body>
<!--#include virtual="/header.html"-->
<!--#include virtual="/feeds.html"-->
<div class="post">
<!--#include virtual="/navigation.html"-->
<p>
<em>Tux Machines</em> places great emphasis on covering both GNU and Linux.
We occasionally also cover other Free and Open Source operating systems,
as well as games, applications, instructional posts, and, very occasionally, relevant proprietary software.
</p>
</div>
<h1 class="recent">Recent Tux Machines Posts</h1>
<!--#include virtual="/latest-news.html"-->
<!--#include virtual="/footer.html"-->
</body>
</html>
./irc.shtml
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>IRC and Tux Machines</title>
<meta name="dc.date.created" content="20220712" />
<meta name="dc.description" content="Do you waddle the waddle" />
<link rel="stylesheet" href="/CSS/tuxmachines.css"
media="screen" type="text/css" />
</head>
<body>
<!--#include virtual="/header.html"-->
<!--#include virtual="/navigation.html"-->
<!--#include virtual="/feeds.html"-->
<div class="post">
<h1>IRC and Tux Machines</h1>
<p><em>Tux Machines</em> invites further discussion of the shared articles on Internet Relay Chat (IRC)...</p>
<p>The IRC channel is <code>#tuxmachines</code> at <code>irc.techrights.org</code>. To use your <em>own</em> IRC client, join channel <code>#tuxmachines</code> in <a href="irc://irc.techrights.org"><code>techrights.org</code></a>. The <a href="http://www.tuxmachines.org/node/132043">IRC logs can be viewed or downloaded</a>.</p>
<details>
<summary>Try the Kiwiirc browser-based client if your browser is encumbered by JavaScript:</summary>
<iframe src="https://kiwiirc.com/nextclient/irc.techrights.org/?nick=tr_guest|?#tuxmachines" style="width:470px; height:450px; border:4px double gray; box-shadow: 5px 5px 5px #222;"></iframe>
</details>
<p>
Another option (if the above does not work) is Mibbit, but there may be privacy risks.
</p>
<details>
<summary>Try the Mibbit browser-based client if your browser is encumbered by JavaScript:</summary>
<iframe width="470" height="380" scrolling="no" frameborder="0" src="https://widget.mibbit.com/?settings=7ca12664887d4b6e7a0fa6552f9e0de6&amp;server=irc.techrights.org&amp;autoConnect=true&amp;channel=%23tuxmachines" style="border:4px double gray; box-shadow: 5px 5px 5px #222;">
</iframe>
</details>
<p>Use any of the above. Again, use <u>with caution</u>. There may be privacy concerns with using the browser-based clients, so try to use your own IRC client before trying browser-based clients like Mibbit or Kiwiirc. Download an IRC client and enter the required details into it. The Internet Relay Chat (IRC) channel is <code>#tuxmachines</code> at the IRC network <a href="irc://irc.techrights.org"><code>techrights.org</code></a>.</p>
<p>The IRC chats can be used for direct messaging as well.</p>
</div>
<h2 class="latest">Other Recent Tux Machines Posts</h2>
<!--#include virtual="/latest-news.html"-->
<!--#include virtual="/footer.html"-->
</body>
</html>
./navigation.html
<div class="navigation"> <!-- <p class="alpha"> (ℹ) These pages are on an <b>alpha</b> (testing) site. <br /> For the <b>production/live site</b>, please see <a href="http://tuxmachines.org/"><code>tuxmachines.org</code></a> instead. </p> --> <ul> <li><a href="/index.shtml">Home</a></li> <li><a href="/about.shtml">About</a></li> <li><a href="/irc.shtml">IRC</a></li> <li><a href="gemini://gemini.tuxmachines.org/">Gemini Edition</a></li> <!-- <li><a href="/search.html">Search</a></li> --> <li><a href="/feed.xml">Feed</a></li> <!-- <li><a href=""></a></li> --> </ul> </div>
./robots.txt
# $Id: robots.txt,v 1.7.2.3 2008/12/10 20:24:38 drumm Exp $ # # robots.txt # # This file is to prevent the crawling and indexing of certain parts # of your site by web crawlers and spiders run by sites like Yahoo! # and Google. By telling these "robots" where not to go on your site, # you save bandwidth and server resources. # # This file will be ignored unless it is at the root of your host: # Used: http://example.com/robots.txt # Ignored: http://example.com/site/robots.txt # # For more information about the robots.txt standard, see: # http://www.robotstxt.org/wc/robots.html # # For syntax checking, see: # http://www.sxw.org.uk/computing/robots/check.html # Baiduspider # User-agent: Baiduspider # Disallow: / User-agent: * Crawl-delay: 10 # Directories Disallow: /includes/ Disallow: /misc/ Disallow: /modules/ Disallow: /profiles/ Disallow: /scripts/ Disallow: /sites/ Disallow: /themes/ # Files Disallow: /CHANGELOG.txt Disallow: /cron.php Disallow: /INSTALL.mysql.txt Disallow: /INSTALL.pgsql.txt Disallow: /install.php Disallow: /INSTALL.txt Disallow: /LICENSE.txt Disallow: /MAINTAINERS.txt Disallow: /update.php Disallow: /UPGRADE.txt Disallow: /xmlrpc.php # Paths (clean URLs) Disallow: /admin/ Disallow: /comment/reply/ Disallow: /contact/ Disallow: /logout/ Disallow: /node/add/ Disallow: /search/ Disallow: /user/register/ Disallow: /user/password/ Disallow: /user/login/ # Paths (no clean URLs) Disallow: /?q=admin/ Disallow: /?q=comment/reply/ Disallow: /?q=contact/ Disallow: /?q=logout/ Disallow: /?q=node/add/ Disallow: /?q=search/ Disallow: /?q=user/password/ Disallow: /?q=user/register/ Disallow: /?q=user/login/
./search.py
#!/usr/bin/env python3
# see also :
# https://www.blopig.com/blog/2021/05/hosting-multiple-flask-apps-using-apache-mod_wsgi/
# https://flask.palletsprojects.com/en/latest/quickstart/
# https://flask.palletsprojects.com/en/latest/quickstart/#rendering-templates
from flask import Flask, request, render_template
from time import time
app = Flask(__name__)
# "application" is the variable name expected by mod_wsgi
application = app
a = 0
@app.route("/", methods=['POST', 'GET'])
def bar():
    """Serve the search page.

    A GET request with a non-empty ``q`` query parameter renders
    ``search-results.html`` with ``result`` set to that value.  Every
    other request (a GET without ``q``, or a POST) renders the empty
    search form ``new-search.html``.
    """
    q = request.args.get('q', '')
    if request.method == 'GET' and q != '':
        return render_template('search-results.html', result=q)
    return render_template('new-search.html')
# Start Flask's built-in server when this file is run directly.
if __name__ == "__main__":
    # app.run()
    # NOTE(review): this listens on every interface (0.0.0.0) with
    # debug=True - confirm this entry point is never used on a host
    # that is reachable from outside.
    app.run(host='0.0.0.0', debug=True)
./00_README
# the following are hard links to the same files # in the other directory: feeds.html footer.html header.html latest-news.html navigation.html # the other files are local
./new-search.html
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Tux Machines</title>
<link rel="stylesheet" href="/CSS/tuxmachines.css"
media="screen" type="text/css" />
</head>
<body>
{# the included files are hardlinks from the main directory #}
{% include 'header.html' %}
{% include 'navigation.html' %}
<div class="search">
{% include 'feeds.html' %}
<h1>New Search</h1>
<p>
<form action="" method="GET">Search for:
<input type="text" name="q" size="50" />
<input type="submit" value="Search" />
<br />
<input type="radio" name="term" value="dc.date"
id="field01" />
<label for="field01">By Date</label>
<input type="radio" name="term" value="dc.title"
id="field02" checked="1" />
<label for="field02">By Title</label>
<span class="period">
<input type="radio" name="period" value="lt" id="field03"
class="pd" />
<label for="field03">Before ( &lt; )</label>
<input type="radio" name="period" value="le" id="field04"
class="pd" />
<label for="field04">Up to ( &lt; = )</label>
<input type="radio" name="period" value="eq" id="field05"
class="pd" />
<label for="field05">Equals ( = )</label>
<input type="radio" name="period" value="ge" id="field06"
class="pd" checked="1" />
<label for="field06">Starting ( > = )</label>
<input type="radio" name="period" value="gt" id="field07"
class="pd" />
<label for="field07">After ( > )</label>
</span>
<span class="scope">
<input type="radio" name="scope" value="ct" id="field08"
class="sp" checked="1" />
<label for="field08">Contains</label>
<input type="radio" name="scope" value="eq" id="field09"
class="sp" />
<label for="field09">Exact match</label>
</span>
</form>
</p>
</div>
<h1>Other Recent Tux Machines' Posts</h1>
{% include 'latest-news.html' %}
{% include 'footer.html' %}
</body>
</html>
./search-results.html
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Tux Machines</title>
</head>
<body>
<div class="search">
{% if result %}
<h1>Results:</h1>
<p>{{ result }}!</p>
{% else %}
<h1>No results, do a new Search</h1>
{% endif %}
<p>
<form action = "" method = "GET">
Search for: <input type = "text" name = "q" size="50" />
</form>
</p>
</div>
</body>
</html>
./ReadOn.pm
package TuxMachines::ReadOn 0.02;
use utf8;
use parent qw(Exporter);
use strict;
use warnings;
use 5.010; # state() requires 5.010 or later
our @EXPORT = qw();
our @EXPORT_OK = qw(ReadOn);
# ReadOn($body): fill in missing title attributes on "read on" links.
#
# Parses the markup in $body and, for every <a class="readon"> element
# which has an href but no title (or an empty one), sets the title to the
# name that siteName() returns for the host part of the link.  Links whose
# host is not known to siteName(), or which have no host at all (relative
# links, mailto: and similar), are left as they are.
#
# Returns the serialised contents of the <body> element, without the
# surrounding <body> tags.  An undefined or empty $body is handed back
# unchanged; so is the input if the parsed tree has no <body> element.
sub ReadOn {
    require HTML::TreeBuilder::XPath;
    require URI;
    my ($body) = (@_);

    # nothing to parse
    return($body) unless (defined($body) && length($body));

    my $xhtml = HTML::TreeBuilder::XPath->new;
    $xhtml->implicit_tags(1);
    $xhtml->no_expand_entities(1);
    $xhtml->parse($body);
    # tell the parser that the input is complete, otherwise text which it
    # still holds in its buffer never makes it into the tree
    $xhtml->eof;

    # use the href attribute for a title, if no title attribute exists
    for my $link ($xhtml->findnodes('//a[@class="readon"]')) {
        my $href = $link->attr('href');
        next unless ($href);
        next if ($link->attr('title'));

        my $uri = URI->new($href);
        # only server-based URIs (http, https, ftp, ...) have a host()
        # method; calling it on a relative link or a mailto: would die
        next unless ($uri->can('host'));
        my $host = $uri->host;
        next unless (defined($host) && length($host));

        my $site = siteName($host);
        $link->attr('title', $site) if ($site);
    }

    # extract just the body
    for my $node ($xhtml->findnodes('//body')) {
        $body = $node->as_HTML('', ' ', {});
    }

    # trim the extraneous markup, leaving just the body contents
    $body =~ s|\s*<body[^>]*>||m;
    $body =~ s|</body>||m;

    $xhtml->destroy;
    return($body);
}
# siteName($site): map a host name to the display name of the site,
# project, person or company behind it.
#
# The lookup is an exact match on the whole host name, compared in lower
# case.  There is no fallback to a parent domain, so "www.kde.org" and
# "kde.org" are separate entries in the table.
#
# Returns the name, or 0 if the host is undefined, empty or not listed.
sub siteName {
    my ($site) = (@_);

    # avoid an "uninitialized value" warning for links without a host
    return(0) unless (defined($site) && length($site));

    # The table is built once and kept between calls.  It is a hash
    # reference because initialising a state hash directly needs
    # perl 5.28, while this file only asks for 5.010.
    state $lookup = {
        "access.redhat.com" => "Red Hat",
        "alicious.com" => "Alicious",
        "allanmcrae.com" => "Allan McRae",
        "alpinelinux.org" => "Alpine Linux",
        "alternativenayk.wordpress.com" => "Nigel Ajay Kumar",
        "anchor.fm" => "Spotify",
        "andrealmeid.com" => "André Almeida",
        "andreldm.com" => "André Miranda",
        "android-developers.googleblog.com" => "Android",
        "apachelog.wordpress.com" => "Harald Sitter",
        "apcmag.com" => "Future US Inc",
        "appuals.com" => "Appuals",
        "archlinux.org" => "Arch Linux",
        "arstechnica.com" => "Ars Technica",
        "atlasvpn.com" => "Atlas VPN",
        "benjiweber.co.uk" => "Benji Weber",
        "berkeleydailyplanet.com" => "California",
        "berry.osdn.jp" => "Berry Linux",
        "bits.debian.org" => "Debian",
        "blog.alteholz.eu" => "Thorsten Alteholz",
        "blog.aurel32.net" => "Aurélien Jarno",
        "blog.diasporafoundation.org" => "Diaspora Foundation",
        "blog.dowhile0.org" => "Javier Martinez Canillas",
        "blog.elementary.io" => "Elementary OS",
        "blog.ffwll.ch" => "Daniel Vetter",
        "blog.hypriot.com" => "Govinda Fichtner",
        "blog.ipfs.io" => "IPFS",
        "blog.jimmac.eu" => "Jakub Steiner",
        "blog.jolla.com" => "Jolla",
        "blog.josefsson.org" => "Simon Josefsson",
        "blog.linuxgrrl.com" => "Máirín Duffy",
        "blog.lxde.org" => "LXDE",
        "blog.mageia.org" => "Mageia",
        "blog.morphisec.com" => "Morphisec Ltd.",
        "blog.neon.kde.org" => "KDE Neon",
        "blog.nightly.mozilla.org" => "Mozilla",
        "blog.openshift.com" => "Red Hat",
        "blog.remirepo.net" => "Remi's RPM Repository",
        "blog.reversinglabs.com" => "Reversing Labs",
        "blog.sergiodj.net" => "Sergio Durigan Junior",
        "blog.slyon.de" => "Lukas Märdian",
        "blog.sonatype.com" => "Sonatype Inc",
        "blog.tingping.se" => "TingPing",
        "blog.vacs.fr" => "Stephane Carrez",
        "blog.vladzahorodnii.com" => "Vlad Zahorodnii",
        "blog.xfce.org" => "XFCE",
        "blog.xuche.ml" => "Shidao",
        "blogops.mixinet.net" => "Sergio Talens-Oliag",
        "blogs.blackberry.com" => "BlackBerry Ltd",
        "blogs.fsfe.org" => "FSFE",
        "blogs.gnome.org" => "GNOME",
        "blogs.igalia.com" => "Igalia S L",
        "blogs.kde.org" => "KDE",
        "blogs.oracle.com" => "Oracle",
        "bluesnews.com" => "Blue's News",
        "bluewhite64.arny.ro" => "Bluewhite Linux",
        "www.bluewhite-linux.com" => "Bluewhite Linux",
        "bonedaddy.net" => "Bonedaddy",
        "bootlin.com" => "Bootlin",
        "bugs.launchpad.net" => "Canonical Ltd",
        "bugzilla.novell.com" => "Micro Focus International plc",
        "buildvirtual.net" => "buildVirtual",
        "bytesmedia.co.uk" => "Bytes Media",
        "cairographics.org" => "Cairo",
        "calibre-ebook.com" => "Calibre",
        "caolanm.blogspot.com" => "Caolán McNamara",
        "carlosbecker.dev" => "Carlos Becker",
        "cdimage.debian.org" => "Debian",
        "cockpit-project.org" => "Red Hat",
        "condresos.codelinsoft.it" => "Condres OS",
        "containerjournal.com" => "Techstrong Group, Inc.",
        "crosscat.me" => "Isabella Basso do Amaral",
        "cubexyz.blogspot.com" => "Cubexyz",
        "cybersecuritynews.com" => "Cyber Security News",
        "danigm.net" => "Daniel Garcia Moreno",
        "dantti.wordpress.com" => "Dantti",
        "darryldias.me" => "Darryl Dias",
        "dasublogbyprashanth.blogspot.com" => "Prashanth",
        "dbaxps.blogspot.com" => "Boris Derzhavets",
        "ddumont.wordpress.com" => "Dominique Dumont",
        "debarshiray.wordpress.com" => "Debarshi Ray",
        "debian.community" => "Debian",
        "debian.tu-bs.de" => "Debian",
        "debianadmin.com" => "Debian",
        "design.blog.documentfoundation.org" => "The Document Foundation",
        "dev.blog.documentfoundation.org" => "The Document Foundation",
        "devblogs.microsoft.com" => "Microsoft Corporation",
        "devconnected.com" => "Devconnected",
        "developer.kde.org" => "KDE",
        "diaspora.psyco.fr" => "Diaspora",
        "dietpi.com" => "DietPi",
        "digg.com" => "Digg",
        "discourse.libsdl.org" => "SDL",
        "discourse.llvm.org" => "LLVM",
        "discourse.ubuntu.com" => "Ubuntu",
        "disney.go.com" => "The Walt Disney Company",
        "dissociatedpress.net" => "Dissociated Press",
        "distro.ibiblio.org" => "Ibiblio",
        "dominique.leuenberger.net" => "Dominique Leuenberger",
        "doom3.filefront.com" => "GameFront Ltd",
        "gamefront.com" => "GameFront Ltd",
        "download.nvidia.com" => "Nvidia",
        "download.opensuse.org" => "OpenSUSE",
        "draculaservers.com" => "Dracula Servers",
        "dropbear.xyz" => "Craig Small",
        "drupal.org" => "Drupal",
        "dustycloud.org" => "Christine Lemmer-Webber",
        "dvratil.cz" => "Daniel Vrátil",
        "dylanmc.ca" => "Dylan McCall",
        "ein.iconnect007.com" => "I-Connect007",
        "eiosifidis.blogspot.com" => "Efstathios Iosifidis",
        "markcockrell.com" => "Mark Cockrell",
        "elessar-space.blogspot.com" => "Amrit Borah",
        "elivecd.org" => "Elive",
        "emersion.fr" => "Simon Ser",
        "emmabuntus.org" => "Emmabuntüs",
        "en.opensuse.org" => "OpenSUSE",
        "en.wikipedia.org" => "Wikipedia",
        "endeavouros.com" => "EndeavourOS",
        "entertainment-focus.com" => "Entertainment Focus",
        "esr.ibiblio.org" => "ESR",
        "evolution-security.com" => "Evolution Security",
        "fale.io" => "Fabio Alessandro Locati",
        "fasterland.net" => "Francesco Mondello",
        "featherlinux.berlios.de" => "Felipe Borges",
        "features.opensuse.org" => "OpenSUSE",
        "feeding.cloud.geek.nz" => "François Marier",
        "feeds.feedburner.com" => "FeedBurner",
        "fhek.gitlab.io" => "Felipe Kinoshita",
        "fisica.ufpr.br" => "Universidade Federal do Paraná",
        "fitzcarraldoblog.wordpress.com" => "Fitzcarraldo",
        "flavio.castelli.me" => "Flavio Castelli",
        "flosslinuxblog.blogspot.com" => "Andrew Cater",
        "fluxbuntu.org" => "Ubuntu",
        "foolcontrol.org" => "Adnan Hodzic",
        "forgesvn1.novell.com" => "Novell",
        "forum.manjaro.org" => "Manjaro",
        "forum.openmandriva.org" => "OpenMandriva",
        "forum.opnsense.org" => "OPNsense",
        "forum.porteus.org" => "Porteus",
        "forums.fedoraforum.org" => "IBM",
        "forums.gentoo.org" => "Gentoo",
        "fossa.com" => "FOSSA Inc",
        "fossweekly.beehiiv.com" => "FOSS Weekly",
        "freeaptitude.altervista.org" => "Fabio Mucciante",
        "freedesktop.org" => "Software in the Public Interest Inc",
        "freedom-to-tinker.com" => "Freedom to Tinker",
        "frictionalgames.com" => "Frictional Games AB",
        "fridge.ubuntu.com" => "Canonical Ltd",
        "frugalware.org" => "Frugalware",
        "fsfellowship.news" => "Free Software Fellowship",
        "ftp.osuosl.org" => "Open Source Lab",
        "ftp.sh.cvut.cz" => "Kancelář klubu Silicon Hill",
        "fullcirclemagazine.org" => "Canonical Ltd",
        "futurumresearch.com" => "Futurum Research",
        "gadgets-africa.com" => "Gadgets Africa",
        "gadgetstouse.com" => "Gadgets to Use",
        "game-news24.com" => "Game-News24",
        "geekfeminism.org" => "Geek Feminism Blog",
        "gemini.circumlunar.space" => "Project Gemini",
        "gentoo-blog.de" => "Gentoo",
        "getsol.us" => "Solus",
        "gfldex.wordpress.com" => "Gfldex",
        "glandium.org" => "Mike Hommey",
        "glovesoff.substack.com" => "Gloves Off Linux",
        "gnu.wildebeest.org" => "Mark J. Wielaard",
        "gnunet.org" => "GNUnet e.V.",
        "go.dev" => "Golang",
        "goinglinux.com" => "Going Linux",
        "gottcode.org" => "Graeme Gott",
        "grep.be" => "Wouter Verhelst",
        "gstreamer.freedesktop.org" => "GStreamer",
        "hackernet.in" => "HackerNet.in",
        "hardware.slashdot.org" => "Slashdot",
        "help.ubuntu.com" => "Canonical Ltd",
        "honk.sigxcpu.org" => "Guido Günther",
        "hosted.filefront.com" => "GameFront Ltd",
        "hpjansson.org" => "Hans Petter Jansson",
        "ignapk.blogspot.com" => "Ignacy Kuchciński",
        "info.kpmg.us" => "KPMG LLP",
        "informatique-libre.be" => "Sébastien Wilmet",
        # was "Sébastien Wilmet", the value of the entry above
        "insights.sei.cmu.edu" => "Carnegie Mellon University",
        "insights.ubuntu.com" => "Canonical Ltd",
        "iranzo.io" => "Pablo Iranzo Gómez",
        "irlpodcast.org" => "Mozilla Corporation",
        "istio.io" => "Istio",
        "itsubuntu.com" => "It's Ubuntu",
        "jmmv.dev" => "Julio Merino",
        "jmtd.net" => "Jonathan Dowland",
        "julialang.org" => "The Julia Programming Language",
        "k-d-w.org" => "Sebastian Pölsterl",
        "k7r.eu" => "Matthias Kirschner",
        "kaisenlinux.org" => "Kaisen Linux",
        "kaosx.us" => "KaOS",
        "kate-editor.org" => "KDE",
        "kde-apps.org" => "Libre Software and Apps for Linux",
        "kde.org" => "KDE",
        "kdenlive.org" => "Kdenlive",
        "kennyvn.com" => "Kenneth",
        "kernelnewbies.org" => "Kernelnewbies",
        "kerneltalks.com" => "KernelTalks",
        "kifarunix.com" => "Kifarunix",
        "kiwitcms.org" => "Kiwi TCMS",
        "knowledgebasement.com" => "Knowledge Basement",
        "kparal.wordpress.com" => "Kamil Páral",
        "krshrimali.github.io" => "Kushashwa Ravi Shrimali",
        "kubuntu.org" => "Kubuntu",
        "kushaldas.in" => "Kushal Das",
        "kwort.org" => "Kwort Linux",
        "latenightlinux.com" => "Late Night Linux",
        "lateweb.info" => "LateWeb.info",
        "launchpad.net" => "Canonical Ltd",
        "laysrodriguesdev.wordpress.com" => "Lays Rodrigues",
        "leancrew.com" => "Dr Drang",
        "learn.sparkfun.com" => "SparkFun Electronics",
        "levvvel.com" => "Bambeo GmbH",
        "librearts.org" => "Libre Arts",
        "linoxide.com" => "LinOxide",
        "linux-apps.com" => "Libre Software and Apps for Linux",
        "linux-video-tutorials.blogspot.com" => "Linux Video Tutorials",
        "linux.bytesex.org" => "Gerd Knorr",
        "linux.com" => "The Linux Foundation",
        "linuxadmin.io" => "LinuxAdmin.io",
        "linuxblog.darkduck.com" => "DarkDuck",
        "linuxformat.com" => "Linux Format",
        "linuxg.net" => "LinuxG.net",
        "linuxiumcomau.blogspot.com" => "Linuxium",
        "linuxlock.blogspot.com" => "ReGlue",
        "linuxmint.com" => "Linux Mint",
        "linuxnewbieguide.org" => "The Ultimate Linux Newbie Guide",
        "linuxpackages.net" => "Linux Packages",
        "linuxroutes.com" => "UxTechno",
        "linuxscoop.com" => "Linux Scoop",
        "linuxsecurityblog.com" => "Linux Security Blog",
        "linuxstans.com" => "Linux Stans",
        "linuxtoday.com" => "Linux Today",
        "linuxtracker.org" => "Linux Tracker",
        "linuxwebdevelopment.com" => "LinuxWebDevelopment",
        "linuxwizardry.com" => "LinuxWizardry",
        "liquidat.wordpress.com" => "Roland Wolters",
        "lists.fedoraproject.org" => "IBM",
        "lists.kde.org" => "KDE",
        "lists.opensuse.org" => "OpenSUSE",
        "lists.x.org" => "X.Org",
        "live.debian.net" => "Debian",
        "lkml.org" => "Linux",
        "llunak.blogspot.com" => "Llunak",
        "lore.kernel.org" => "Linux",
        "lu.is" => "Luis Villa",
        "lxde.sourceforge.net" => "LXDE",
        "maboxlinux.org" => "Mabox Linux",
        "machinarium.net" => "Amanita Design",
        "magtheweekly.com" => "Mag The Weekly",
        "mail.gnome.org" => "GNOME",
        "mail.python.org" => "Python",
        "mairacanal.github.io" => "Maíra Canal",
        "major.io" => "Major Hayden",
        "mandrivachronicles.blogspot.com" => "Mandriva",
        "manishearth.github.io" => "Manish Goregaokar",
        "manjaro.org" => "Manjaro",
        "mapopa.blogspot.com" => "Popa Adrian Marius",
        "martinheinz.dev" => "Martin Heinz",
        "mastodon.technology" => "Fediverse",
        "mate-desktop.org" => "MATE",
        "matt-rickard.ghost.io" => "Matt Rickard",
        "meanmicio.org" => "Luis Falcon",
        "media.libreplanet.org" => "LibrePlanet",
        "medibuntu.org" => "Ubuntu",
        "meeksfamily.uk" => "Julia Meeks",
        "melix99.wordpress.com" => "Marco Melorio",
        "memcpy.io" => "Robert Foss",
        "mivehind.net" => "Robbie Harwood",
        "money.cnn.com" => "CNN",
        "mypclinuxos.com" => "PCLinuxOS",
        "mytechdecisions.com" => "TechDecisions",
        "neptuneos.com" => "Neptune",
        "net2.com" => "Net2.com",
        "news.bbc.co.uk" => "BBC",
        "news.opensuse.org" => "OpenSUSE",
        "news.rub.de" => "Ruhr-Universität Bochum",
        "news.slashdot.org" => "Slashdot",
        "news.tuxmachines.org" => "Tux Machines",
        "news.zdnet.com" => "Red Ventures",
        "nextgentips.com" => "NextGenTips",
        "ninja-ide.org" => "Ninja IDE",
        "nixos.org" => "NixOS",
        "nnethercote.github.io" => "Nicholas Nethercote",
        "nokiapoweruser.com" => "NokiaPowerUser",
        "nts.strzibny.name" => "Josef Strzibny",
        "nullr0ute.com" => "Peter Robinson",
        "nutyx.org" => "NuTyX",
        "octopus.com" => "Octopus Deploy",
        "odysee.com" => "LBRY",
        "openqa-bites.github.io" => "openQA",
        "opensuse-community.org" => "OpenSUSE",
        "packit.dev" => "Packit",
        "parabolagnulinux.org" => "Parabola GNU Linux",
        "pclinuxos.com" => "PCLinuxOS",
        "phandroid.com" => "Google LLC",
        "phorolinux.com" => "Phoro Linux",
        "picasa.google.com" => "Google",
        "picasaweb.google.com" => "Google",
        "pimpyourlinux.com" => "PimpYourLinux",
        "pimylifeup.com" => "PiMyLifeUp",
        "pine64.com" => "Pine64",
        "pitivi.org" => "Pitivi",
        "piunikaweb.com" => "PiunikaWeb",
        "planet.archlinux.org" => "Arch Linux",
        "planet.debian.org" => "Debian",
        "planet.documentfoundation.org" => "The Document Foundation",
        "planet.freedesktop.org" => "Free Desktop",
        "planet.fsfe.org" => "FSFE",
        "planet.gentoo.org" => "Gentoo",
        "planet.gnome.org" => "GNOME",
        "planet.gnu.org" => "GNU",
        "planet.kernel.org" => "Linux",
        "planet.mageia.org" => "Mageia",
        "planet.mozilla.org" => "Mozilla",
        "planet.opensuse.org" => "OpenSUSE",
        "planet.slackware-id.org" => "Slackware",
        "planet.ubuntu.com" => "Ubuntu",
        "planetkde.org" => "KDE",
        "planetpython.org" => "Python",
        "plasma-mobile.org" => "KDE",
        "plug-mirror.rcac.purdue.edu" => "uni Purdue",
        "postmarketos.org" => "PostmarketOS",
        "ppa.launchpad.net" => "Ubuntu",
        "prezu.ca" => "Patryk Cisek",
        "qa.blog.documentfoundation.org" => "The Document Foundation",
        "quochungtran.github.io" => "Quoc Hung Tran",
        "rafaelc.org" => "Rafael Cavalcanti",
        "rakudo.org" => "Rakulang",
        "raphlinus.github.io" => "Raph Levien",
        "rcpmag.com" => "Redmond Channel Partner",
        "redhatstackblog.redhat.com" => "Red Hat",
        "redhat.com" => "Red Hat",
        "redmonk.com" => "RedMonk",
        "reproducible-builds.org" => "Reproducible Builds",
        "researchsnipers.com" => "Research Snipers",
        "retout.co.uk" => "Tim Retout",
        "reviewjolla.blogspot.com" => "Sailfish OS Reviews",
        "rhelblog.redhat.com" => "Red Hat",
        "rk.edu.pl" => "RkBlog",
        "rlworkman.net" => "Robby Workman",
        "robohub.org" => "Robohub",
        "robots.net" => "Robots.net",
        "rockylinux.org" => "Rocky Linux",
        "rojas.run" => "Reinold Rojas",
        "rolisteam.org" => "Rolisteam",
        "rwmj.wordpress.com" => "Richard WM Jones",
        "samarthrajwrites.wordpress.com" => "Samarth Raj",
        "samthursfield.wordpress.com" => "Sam Thursfield",
        "saxenos.de" => "Saxenos",
        "scribblesandsnaps.com" => "Scribblesandsnaps",
        "sfconservancy.org" => "Software Freedom Conservancy",
        "shallowsky.com" => "Akkana Peck",
        "skrooge.org" => "Skrooge",
        "slackbuilds.org" => "Slackware",
        "slackware.com" => "Slackware",
        "slackware.cs.utah.edu" => "uni Utah",
        "slated.org" => "Slated",
        "sleepmap.de" => "David Runge",
        "smallcultfollowing.com" => "Niko Matsakis",
        "smlr.us" => "Sunday Morning Linux Review",
        "snehit.dev" => "Snehit Sah",
        "software.jaos.org" => "Jason Woodward",
        "software.opensuse.org" => "OpenSUSE",
        "sourceforge.net" => "SourceForge",
        "sourceware.org" => "Sourceware.org",
        "sparkylinux.org" => "SparkyLinux",
        "speedysense.com" => "Speedy Sense",
        "spidermonkey.dev" => "Mozilla",
        "spwhitton.name" => "Sean Whitton",
        "standardsandfreedom.net" => "Charles-H. Schulz",
        "steamcommunity.com" => "Steam Community",
        "stmuk.wordpress.com" => "Steve Mynott",
        "suicide.fyi" => "Debian Community News",
        "support.frictionalgames.com" => "Frictional Games AB",
        "susewiki.org" => "OpenSUSE",
        "syslinux.zytor.com" => "Syslinux",
        "wiki.syslinux.org" => "Syslinux",
        "systemoverlord.com" => "David Tomaschik",
        "szopa.org.pl" => "Adam Szopa",
        "t2sde.org" => "T2 SDE",
        "tales-aparecida.github.io" => "Tales L. Aparecida",
        "taming-libreoffice.com" => "Jean Hollis Weber",
        "techbase.kde.org" => "KDE",
        "technastic.com" => "TechBullion",
        "technode.global" => "TechNode Global",
        "techpp.com" => "TechPP",
        "techrights.org" => "Techrights",
        "techthelead.com" => "TechTheLead",
        "tecnocode.co.uk" => "Philip Withnall",
        "thanatermesis.org" => "Samuel Flores Baggen",
        "thatlinuxthing.blogspot.com" => "That Linux Thing",
        "the-gadgeteer.com" => "The Gadgeteer",
        "thefutureofthings.com" => "The Future of Things",
        "thegadgetflow.com" => "The Gadget Flow",
        "thegcomprisivy.wordpress.com" => "Aastha Chauhan",
        "theweeklychallenge.org" => "The Weekly Challenge: Perl & Raku",
        "thishosting.rocks" => "ThisHostingRocks",
        "tigeroakes.com" => "Tiger Oakes",
        "timur.hu" => "Timur Kristóf",
        "tipsonubuntu.com" => "TipsOnUbuntu",
        "tipsonunix.com" => "TipsOnUnix",
        "tldp.org" => "The Linux Documentation Project",
        "tllts.org" => "The Linux Link Tech Show",
        "tsdgeos.blogspot.com" => "Albert Astals Cid",
        "tuxmachines.org" => "Tux Machines",
        "twit.tv" => "This Week in Tech",
        "twitter.com" => "Twitter",
        "ubports.com" => "UBports Foundation",
        "ubuntu-mate.org" => "Ubuntu",
        "ubuntu-news.org" => "Ubuntu",
        "ubuntuhak.blogspot.com" => "Ubuntu",
        "ubuntunext.com" => "Ubuntu",
        "ubuntupodcast.org" => "Ubuntu",
        "ubuntuportal.com" => "Ubuntu",
        "ubuntustudio.org" => "Ubuntu",
        "ultimateedition.info" => "Ultimate Edition",
        "unvanquished.net" => "Unvanquished",
        "upload.wikimedia.org" => "Wikimedia",
        "uptime.netcraft.com" => "Netcraft",
        "utkarsh2401.blogspot.com" => "Utkarsh Gandhi",
        "valdyas.org" => "Irina Rempt",
        "venthur.de" => "Bastian Venthur",
        "vfxplatform.com" => "VFX Reference Platform",
        "video.linuxfoundation.org" => "The Linux Foundation",
        "viruta.org" => "Federico Mena Quintero",
        "visualstudiomagazine.com" => "Visual Studio Magazine",
        "vl8r.eu" => "Vincent Lequertier",
        "vmiklos.hu" => "Miklos Vajna",
        "vpv.kapsi.fi" => "Ville-Pekka Vainio",
        "weblog.infoworld.com" => "IDG Communications Inc",
        "infoworld.com" => "IDG Communications Inc",
        "www.infoworld.com" => "IDG Communications Inc",
        "webmink.com" => "Simon Phipps",
        "websvn.kde.org" => "KDE",
        "whitepapers.theregister.com" => "The Register",
        "who-t.blogspot.com" => "Peter Hutterer",
        "wiki.archcraft.io" => "Archcraft",
        "wiki.archlinux.org" => "ArchLinux",
        "wiki.ubuntu.com" => "Ubuntu",
        "win.gg" => "WIN.gg",
        "wingolog.org" => "Andy Wingo",
        "warnerbros.com" => "Warner Bros Entertainment Inc",
        "wip.warnerbros.com" => "Warner Bros Entertainment Inc",
        "filippo.io" => "Filippo Valsorda",
        "words.filippo.io" => "Filippo Valsorda",
        "wordsmith.social" => "Fediverse",
        "www.2daygeek.com" => "2DayGeek",
        "www.3ders.org" => "3Ders",
        "www.absolutelinux.org" => "Absolute Linux",
        "www.addtoany.com" => "Add2Any",
        "www.admin-magazine.com" => "Linux New Media USA LLC",
        "www.aerospacemanufacturinganddesign.com" => "GIE Media Inc",
        "www.alphr.com" => "Box 20 LLC",
        "www.alsa-project.org" => "Advanced Linux Sound Architecture",
        "www.andreas-loibl.de" => "Andreas Loibl",
        "www.archlinux.org" => "Arch Linux",
        "www.arklinux.org" => "Ark Linux",
        "www.artificialintelligence-news.com" => "AI News",
        "www.aryank.in" => "Aryan Kaushik",
        "www.avm.de" => "AVM Computersysteme Vertriebs GmbH",
        "www.axivion.com" => "Axivion",
        "www.backports.org" => "Debian",
        "www.badvoltage.org" => "Bad Voltage",
        "www.bbntimes.com" => "BBN Times",
        "www.berrange.com" => "Daniel P. Berrangé",
        "www.betanews.com" => "BetaNews",
        "www.bgr.in" => "Broad Guidance & Ratings",
        "www.binaryemotions.com" => "Binary Emotions",
        "www.binarytides.com" => "BinaryTides",
        "www.blackmoreops.com" => "blackMORE Ops",
        "www.brunolinux.com" => "Bruno Linux",
        "www.burdell.org" => "David Cantrell",
        "www.businessweek.com" => "Bloomberg Businessweek",
        "www.cfg2html.com" => "Config to HTML",
        "www.channel4.com" => "Channel Four Television Corporation",
        "www.cio.com" => "IDG Communications Inc",
        "www.cloudlinux.com" => "CloudLinux OS",
        "www.cnbctv18.com" => "CNBC",
        "www.codecoffee.com" => "Sean Robinson",
        "www.codeplex.com" => "Microsoft Corporation",
        "www.commandlinefu.com" => "Dan's Tools",
        "www.cravingtech.com" => "Craving Tech",
        "www.cyberkendra.com" => "Cyber Kendra",
        "www.cyberpratibha.com" => "Cyber Pratibha",
        "www.datacenterknowledge.com" => "DataCenter Knowledge",
        "www.datafix.com.au" => "Dr Bob Mesibov",
        "www.datamation.com" => "TechnologyAdvice",
        "www.debian.org" => "Debian",
        "www.debianhelp.co.uk" => "Debian",
        "www.debianpure.com" => "Debian",
        "www.debuntu.org" => "Ubuntu",
        "www.decadent.org.uk" => "Ben Hutchings and Natalie Mayer-Hutchings",
        "www.deepin.org" => "Deepin Linux",
        "www.demorecorder.com" => "DI Christian Linhart GmbH",
        "www.desktoplinux.com" => "Ziff Davis Enterprise",
        "www.developer.com" => "TechnologyAdvice",
        "www.digikam.org" => "digiKam",
        "www.digitalinformationworld.com" => "DigitalInformationWorld",
        "www.distrowatch.com" => "DistroWatch",
        "www.dqindia.com" => "Cyber Media Ltd",
        "www.droid-life.com" => "Droid Life",
        "www.droidgamers.com" => "Droid Gamers",
        "www.droplinegnome.net" => "Dropline Gnome",
        "www.droplinegnome.org" => "Dropline Gnome",
        "www.drydeadfish.co.uk" => "Dry Dead Fish",
        "www.eclipse.org" => "IBM",
        "www.edubuntu.org" => "Ubuntu",
        "www.electriccitymagazine.ca" => "Electric City Magazine",
        "www.elivecd.org" => "Elive",
        "www.enricozini.org" => "Enrico Zini",
        "www.enterprisestorageforum.com" => "Enterprise Storage Forum",
        "www.everydaylinuxuser.com" => "Everyday Linux User",
        "www.exploit-db.com" => "Exploit Database",
        "www.extix.se" => "ExTiX Linux",
        "www.eyrie.org" => "Russ Allbery",
        "www.fairphone.com" => "Fairphone",
        "www.fantasyflightgames.com" => "Fantasy Flight Publishing Inc",
        "www.fedorafaq.org" => "The Unofficial Fedora® FAQ",
        "www.ffmpeg.org" => "FFmpeg",
        "www.flickr.com" => "Flickr",
        "www.forbesindia.com" => "Forbes India",
        "www.fosslicious.com" => "FOSSlicious",
        "www.fossmint.com" => "FOSSMint",
        "www.foxlinux.org" => "FrozenTech",
        "www.fsl.cs.sunysb.edu" => "Stony Brook University",
        "www.fuss.bz.it" => "FUSS",
        "www.gadgetbridge.com" => "Gadget Bridge",
        "www.gearnews.com" => "Remise 3 Medienservice GmbH",
        "www.gearrice.com" => "Gearrice",
        "www.gizbot.com" => "Gizbot",
        "www.gizmodo.com.au" => "Gizmodo Australia",
        "www.gnuworldorder.info" => "GNU World Order",
        "www.goodbyemicrosoft.net" => "Goodbye, Microsoft®",
        "www.google.com" => "Google",
        "www.gotechtor.com" => "Gotechtor LLC",
        "www.guidingtech.com" => "Guiding Tech",
        "www.guru3d.com" => "The Guru of 3D",
        "www.hadess.net" => "Bastien Nocera",
        "www.hardwaretimes.com" => "HardwareTimes",
        "www.hollywood.com" => "Hollywood.com LLC",
        "www.ibiblio.org" => "Ibiblio",
        "www.imdb.com" => "The Internet Movie Database",
        "www.inconnect.de" => "INconnect GmbH",
        "www.indidea.org" => "Gaël Duval",
        "www.internetnews.com" => "InternetNews",
        "www.iottechtrends.com" => "Uqnic Network Pte Ltd",
        "www.itbusinessedge.com" => "TechnologyAdvice",
        "www.itjungle.com" => "IT Jungle",
        "www.itwire.com.au" => "ITWire",
        "www.jankratochvil.net" => "Jan Kratochvil",
        "www.jlekstrand.net" => "Jason and Laura Ekstrand",
        "www.jonobacon.com" => "Jono Bacon",
        "www.junauza.com" => "TechSource",
        "www.kalitutorials.net" => "Kali Tutorials",
        "www.kateos.org" => "Kateo",
        "www.kde-apps.org" => "KDE",
        "www.kde-look.org" => "KDE",
        "www.kde.org" => "KDE",
        "www.kernel.org" => "Linux",
        "www.kitguru.net" => "KitGuru",
        "www.knopper.net" => "Klaus Knopper",
        "www.knoppix.com" => "Knoppix",
        "www.knoppix.net" => "Knoppix",
        "www.knoppix.org" => "Knoppix",
        "www.kstuff.org" => "KDE",
        "store.kde.org" => "KDE",
        "www.kwort.org" => "Kwort Linux",
        "www.lifehacker.com.au" => "Pedestrian Group",
        "www.linoxide.com" => "BTreme",
        "www.linux-live.org" => "Linux Live Kit",
        "www.linux-on-laptops.com" => "LinuxCertified Inc",
        "www.linux-wizard.net" => "Linux Wizard",
        "www.linux.com" => "Linux",
        "www.linuxandubuntu.com" => "LinuxAndUbuntu.com",
        "www.linuxcertified.com" => "LinuxCertified Inc",
        "www.linuxcommand.org" => "William E Shotts Jr",
        "www.linuxforfreshers.com" => "LinuxForFreshers",
        "www.linuxformat.co.uk" => "Linux Format",
        "www.linuxformat.com" => "Linux Format",
        "www.linuxfromscratch.org" => "Linux From Scratch",
        "www.linuxmint.com" => "Linux Mint",
        "www.linuxsecurity.com" => "Guardian Digital Inc",
        "www.linuxtechmore.com" => "Linux-Tech & More",
        "www.linuxtechnews.com" => "LinuxTechNews",
        "www.linuxtoday.com" => "LinuxToday",
        "www.linuxtrainingacademy.com" => "Linux Training Academy",
        "www.linuxtuto.com" => "Linux Tutorials",
        "www.linuxuprising.com" => "Linux Uprising",
        "www.linuxuserspace.show" => "Linux User Space",
        "www.linuxvoice.com" => "Linux Voice",
        "www.lisenet.com" => "Tomas Nevar",
        "www.madelinepeck.com" => "Madeline Peck",
        "www.makululinux.com" => "Makulu Linux",
        "www.mandiant.com" => "Mandiant",
        "www.medibuntu.org" => "Mediabuntu",
        "www.memorysafety.org" => "Internet Security Research Group",
        "www.memtest.org" => "Memtest86",
        "www.microsoft.com" => "The Beast in Redmond",
        "www.mono-project.com" => "Microsoft Corporation",
        "www.montanalinux.org" => "Montana Linux",
        "www.mozilla.com" => "Mozilla Corporation",
        "www.mozilla.org" => "Mozilla Corporation",
        "www.mplayerhq.hu" => "MPlayer",
        "www.mycomputertips.co.uk" => "My Computer Tips",
        "www.mypclinuxos.com" => "PCLinuxOS",
        "www.nepalnews.com" => "NepalNews",
        "www.newtelegraphng.com" => "Daily Telegraph Publishing Company Ltd",
        "www.nextpit.com" => "NextPit",
        "www.nimblex.net" => "NimbleX",
        "www.nongnu.org" => "Savannah",
        "www.noobslab.com" => "NoobsLab",
        "www.novell.com" => "Novell",
        "www.nvidia.com" => "Nvidia Corporation",
        "www.ocsmag.com" => "Open Content & Software Magazine",
        "www.on-disk.com" => "Windows Bulletin Tutorials",
        "www.online-tech-tips.com" => "Online Tech Tips",
        "www.opensourceforu.com" => "Open Source For U",
        "www.opensuse.computerlanguages.org" => "OpenSUSE",
        "www.opensuse.org" => "OpenSUSE",
        "www.osalt.com" => "Open Source As Alternative",
        "www.ostechnix.com" => "Open Source, Technology, Nix*",
        "www.parabola.nu" => "Parabola Project",
        "www.pcgamesn.com" => "PCGamesN",
        "www.pcpro.co.uk" => "PC Pro",
        "www.pentestpartners.com" => "Pen Test Partners",
        "www.pocketgamer.com" => "Steel Media Ltd",
        "www.politico.com" => "Politico LLC",
        "www.pulseaudio.org" => "Freedesktop",
        "www.putorius.net" => "Putorius",
        "www.q4os.org" => "Q4OS",
        "www.qubes-os.org" => "Qubes OS",
        "www.rastersoft.com" => "Raster Software",
        "www.reactos.org" => "ReactOS",
        "www.reality2cast.com" => "Reality 2.0",
        "www.realwire.com" => "Realwire Ltd",
        "www.redhat.com" => "Red Hat",
        "www.reviewgeek.com" => "LifeSavvy Media",
        "www.sapeople.com" => "SA-People Ltd",
        "www.scmagazine.com" => "SC Media",
        "www.securities.io" => "Securities.io",
        "www.securitynewspaper.com" => "Information Security Newspaper",
        "www.serverwatch.com" => "TechnologyAdvice",
        "www.slackbook.org" => "Slackware",
        "www.slackbuilds.org" => "Slackware",
        "www.slackersbible.org" => "Slackware",
        "www.slackware.com" => "Slackware",
        "www.slackware.org" => "Slackware",
        "www.slax.org" => "Slax",
        "www.softmaker.com" => "SoftMaker",
        "www.sonypictures.com" => "Sony Pictures Digital Productions Inc",
        "www.southwestreviewnews.com" => "South West Review",
        "www.strongdm.com" => "strongDM",
        "www.stumbleupon.com" => "StumbleUpon",
        "www.supergoodcode.com" => "Mike Blumenkrantz",
        "www.suseforums.net" => "OpenSUSE",
        "www.syslog-ng.com" => "One Identity LLC",
        "www.t3.com" => "Future Publishing Ltd",
        "www.talkandroid.com" => "Talk Android",
        "www.techadvisor.com" => "IDG Communications Inc",
        "www.techdrivein.com" => "Tech Drive-In",
        "www.techgenyz.com" => "TechGenyz",
        "www.technewstoday.com" => "TechNewsToday",
        # the key used to end in ".om", which no real host name matches
        "informationweek.com" => "Informa Tech",
        "www.tfir.io" => "TFIR",
        "www.theautochannel.com" => "The Auto Channel",
        "www.thegeekstuff.com" => "Ramesh Natarajan",
        "www.theinterpretermovie.com" => "The Interpreter",
        "www.tldp.org" => "The Linux Documentation Project",
        "www.tllts.org" => "The Linux Link Tech Show",
        "www.townoflaronge.ca" => "La Ronge Northerner",
        "www.trustedreviews.com" => "TrustedReviews Ltd",
        "www.ubuntu-unleashed.com" => "Ubuntu",
        "www.ubuntu.com" => "Ubuntu",
        "www.ubuntuforums.org" => "Ubuntu",
        "www.ubuntufree.com" => "Ubuntu",
        "www.ubuntugeek.com" => "Ubuntu",
        "www.univention.com" => "Univention GmbH",
        "www.unixtutorial.org" => "Unix Tutorial",
        "www.vcn.bc.ca" => "Vancouver Community Network",
        "www.vegardno.net" => "Vegard Nossum",
        "www.vim.org" => "Vim",
        "www.vmware.com" => "VMWare",
        "www.wxpython.org" => "Python",
        "www.xfce-look.org" => "XFCE",
        "www.xfce.org" => "XFCE",
        "www1.mandrivalinux.com" => "Mandriva",
        "www2.mandriva.com" => "Mandriva",
        "wwwnew.mandriva.com" => "Mandriva",
        "wxwidgets.org" => "wxWidgets",
        "xbmc.org" => "Kodi",
        "kodi.tv" => "Kodi",
        "xfce.org" => "XFCE",
        "xmodulo.com" => "Xmodulo",
        "xonotic.org" => "Xonotic",
        "xubuntu.org" => "Ubuntu",
        "yast.opensuse.org" => "OpenSUSE",
    };

    # host names are case-insensitive, the keys above are all lower case
    $site = lc($site);
    return(exists($lookup->{$site}) ? $lookup->{$site} : 0);
}
./frontpage.php
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>Welcome to Tux Machines</title>
<link rel="alternate" type="application/rss+xml" href="http://news.tuxmachines.org/feed.xml" title="Tux Machines" />
<style type="text/css" media="screen">
hr {
overflow: visible; /* For IE */
padding: 0;
border: none;
border-top: medium double #333;
color: #333;
text-align: center;
}
abbr[title]
{
border-bottom: 1px dashed #ADADAD;
text-decoration: none;
cursor: help;
}
hr:after {
content: "Tux Machines";
display: inline-block;
position: relative;
top: -0.7em;
font-variant: small-caps;
font-size: 0.5em;
padding: 0 0.25em;
background: white;
}
img {
float: center;
padding: 0.3em 0.1em 0.3em 0.1em;
box-shadow: 0.4em 0.4em 0.4em #222;
border: medium solid #aaa;
border-radius: 2.5em;
margin-top: -0.5em;
margin-right: 1em;
margin-bottom: 1em;
max-width: 30%;
}
tr.odd {
/* background: #eee; */
background: #eee; /* Old browsers */
background: -moz-linear-gradient(top, #ffffff 0%, #f1f1f1 50%, #e1e1e1 51%, #f6f6f6 100%); /* FF3.6+ */
background: -webkit-gradient(linear, left top, left bottom, color-stop(0%,#ffffff), color-stop(50%,#f1f1f1), color-stop(51%,#e1e1e1), color-stop(100%,#f6f6f6)); /* Chrome,Safari4+ */
background: -webkit-linear-gradient(top, #ffffff 0%,#f1f1f1 50%,#e1e1e1 51%,#f6f6f6 100%); /* Chrome10+,Safari5.1+ */
background: -o-linear-gradient(top, #ffffff 0%,#f1f1f1 50%,#e1e1e1 51%,#f6f6f6 100%); /* Opera 11.10+ */
background: -ms-linear-gradient(top, #ffffff 0%,#f1f1f1 50%,#eaeaea 51%,#f6f6f6 100%); /* IE10+ */
background: linear-gradient(to bottom, #ffffff 0%,#f1f1f1 50%,#eaeaea 51%,#f6f6f6 100%); /* W3C */
filter: progid:DXImageTransform.Microsoft.gradient( startColorstr='#ffffff', endColorstr='#f6f6f6',GradientType=0 ); /* IE6-9 */
}
tr.even {
/* background: #ccc; */
background: #ccc; /* Old browsers */
background: -moz-linear-gradient(top, #f2f6f8 0%, #d8e1e7 50%, #b5c6d0 51%, #e0eff9 100%); /* FF3.6+ */
background: -webkit-gradient(linear, left top, left bottom, color-stop(0%,#f2f6f8), color-stop(50%,#d8e1e7), color-stop(51%,#b5c6d0), color-stop(100%,#e0eff9)); /* Chrome,Safari4+ */
background: -webkit-linear-gradient(top, #d2f6f8 0%,#d8e1e7 50%,#b5c6d0 51%,#e0eff9 100%); /* Chrome10+,Safari5.1+ */
background: -o-linear-gradient(top, #fff6f8 0%,#d8e1e7 50%,#d5d6d0 51%,#e0eff9 100%); /* Opera 11.10+ */
background: -ms-linear-gradient(top, #fff6f8 0%,#d8e1e7 50%,#d5d6d0 51%,#f0eff9 100%); /* IE10+ */
background: linear-gradient(to bottom, #fff6f8 0%,#d8e1e7 50%,#d5dedd 51%,#f0eff9 100%); /* W3C */
filter: progid:DXImageTransform.Microsoft.gradient( startColorstr='#f2f6f8', endColorstr='#f0fff9',GradientType=0 ); /* IE6-9 */
}
h1 {
font-size: 180%;
font-variant: small-caps;
text-shadow: #bbb 2px 2px 5px;
}
h2 {
font-size: 125%;
font-variant: small-caps;
text-align: center;
text-shadow: #bbb 1px 1px 2px;
}
h3 h4 {
font-size: 110%;
text-shadow: #555 1px 1px 2px;
}
</style>
</head>
<body bgcolor="#eeeeee" >
<table style="border-style:groove;" width="100%" cellspacing="4">
<tbody>
<tr class="odd">
<td>
<?php
// prints the current date and time in RFC 2822 format,
// e.g. "Thu, 21 Dec 2000 16:01:07 +0200"
echo date(DATE_RFC2822);
?>
<hr />
</td>
<td align="center"><h1><u>Tux Machines</u></h1>
<font size="4"><em>"The price of freedom is eternal vigilance."</em></font>
<p align="right">
~ <font size="3">Thomas Jefferson</font>
</p>
</td>
<td>
<h4><abbr title="A lightweight alternative to the World Wide Web">Gemini</abbr> ䷉</h4>
<ul>
<li><a href="gemini://gemini.tuxmachines.org/">Tux Machines on Gemini</a></li>
</ul>
<hr />
<h4>Gallery ✇</h4>
<ul>
<li><a href="http://www.tuxmachines.org/gallery/main.php">GNU/Linux screenshots</a></li>
</ul>
<hr />
</td>
</tr>
<tr class="even">
<td valign="baseline" width="15%">
<h2>Daily Picks</h2>
<ul>
<li><a href="http://news.tuxmachines.org/">The latest (updated daily)</a></li>
</ul>
<hr />
<h2>Common Destinations</h2>
<dl>
<dd><a href="http://www.tuxmachines.org/node">Older site</a></dd>
<dt>Drupal-based site, now archived</dt>
<dd><a href="http://www.tuxmachines.org/Search">Site search</a></dd>
<dt>Our 2004-2022 archives, nearly 170,000 pages indexed and catalogued</dt>
<dd><a href="http://www.tuxmachines.org/aggregator/sources">News aggregator</a></dd>
<dt>An antenna for GNU/Linux news, still active and up to date</dt>
<dd><a href="http://www.tuxmachines.org/blog">Blogs</a></dd>
<dt>Editorials and site news, now legacy as we've moved to our new CMS</dt>
<dd><a href="https://diaspora.psyco.fr/people/dfba90307f78013ad90d001e67d879df">Diaspora</a></dd>
<dt>JoinDiaspora has shut down, but we're still active in another pod</dt>
<dd><a href="https://mastodon.social/@tuxmachines">Mastodon</a></dd>
<dt>Follow our updates in the Fediverse</dt>
<dd><a href="https://twitter.com/tuxmachines">Twitter</a></dd>
<dt>Proprietary network, but still widely used</dt>
</dl>
<h2>Techrights</h2>
<?php
// Show the newest Techrights headlines, read from that site's RSS feed.
$rss = new DOMDocument();
$feed = array();
// load() returns false when the feed is unreachable or is not well-formed
// XML; in that case $feed stays empty and nothing is listed.
if (@$rss->load('http://techrights.org/feed')) {
    $fields = array('title' => 'title', 'desc' => 'description',
                    'link' => 'link', 'date' => 'pubDate');
    foreach ($rss->getElementsByTagName('item') as $node) {
        $item = array();
        foreach ($fields as $key => $tag) {
            // an <item> may lack a child element; fall back to ''
            $element = $node->getElementsByTagName($tag)->item(0);
            $item[$key] = $element ? $element->nodeValue : '';
        }
        $feed[] = $item;
    }
}
// never index past the end of a short (or empty) feed
$limit = min(18, count($feed));
for ($x = 0; $x < $limit; $x++) {
    $date = date('l F d, Y', strtotime($feed[$x]['date']));
    // nodeValue is decoded text, so escape it again before it goes into
    // an attribute value or element content
    $title = htmlspecialchars($feed[$x]['title'], ENT_QUOTES, 'UTF-8');
    $link = htmlspecialchars($feed[$x]['link'], ENT_QUOTES, 'UTF-8');
    // NOTE(review): the description is echoed unescaped, presumably because
    // the feed carries HTML there -- confirm this is acceptable for a
    // third-party feed
    $description = $feed[$x]['desc'];
    echo '<h4><strong>⚓ Post #'.$x.': <a href="'.$link.'" title="'.$title.'">'.$title.'</a></strong></h4>';
    echo '<p><small><em>Posted on '.$date.'</em></small></p>';
    echo '<p>'.$description.'</p>';
}
?>
<hr />
</td>
<td width="65%">
<h2>Latest Images</h2>
<img src="http://news.tuxmachines.org/Features/latest" alt="Latest image" /> <img src="http://news.tuxmachines.org/Features/second-latest" alt="Second latest image" />
<h2>Latest Additions</h2>
<?php
// Read the site's own RSS feed into $feed, one array per <item> with the
// keys 'title', 'desc', 'link' and 'date'. The later PHP sections of this
// page iterate over $feed.
$rss = new DOMDocument();
$feed = array();
// load() returns false when the feed is unreachable or is not well-formed
// XML; in that case $feed simply stays empty.
if (@$rss->load('http://news.tuxmachines.org/feed.xml')) {
    $fields = array('title' => 'title', 'desc' => 'description',
                    'link' => 'link', 'date' => 'pubDate');
    foreach ($rss->getElementsByTagName('item') as $node) {
        $item = array();
        foreach ($fields as $key => $tag) {
            // an <item> may lack a child element; fall back to ''
            $element = $node->getElementsByTagName($tag)->item(0);
            $item[$key] = $element ? $element->nodeValue : '';
        }
        $feed[] = $item;
    }
}
?>
<?php
// List the feed items whose title does NOT contain the word UPDATED.
// never index past the end of a short (or empty) feed
$limit = min(50, count($feed));
for ($x = 0; $x < $limit; $x++) {
    // strpos() returns 0 for a match at the very start of the title, so a
    // strict comparison is needed to tell that apart from "not found"
    if (strpos($feed[$x]['title'], 'UPDATED') === false) {
        $date = date('l F d, Y', strtotime($feed[$x]['date']));
        // nodeValue is decoded text, so escape it again before it goes
        // into an attribute value or element content
        $title = htmlspecialchars($feed[$x]['title'], ENT_QUOTES, 'UTF-8');
        $link = htmlspecialchars($feed[$x]['link'], ENT_QUOTES, 'UTF-8');
        // NOTE(review): echoed unescaped, presumably because the feed
        // carries HTML in its descriptions -- confirm
        $description = $feed[$x]['desc'];
        echo '<h4><strong>⚓ Post #'.$x.': <a href="'.$link.'" title="'.$title.'">'.$title.'</a></strong></h4>';
        echo '<p><small><em>Posted on '.$date.'</em></small></p>';
        echo '<p>'.$description.'</p>';
    }
}
?>
<hr />
<h2>This Week in History</h2>
Vacant for now, work in progress.
</td>
<td valign="baseline">
<h4><abbr title="Follow the site using XML-based indices">Syndication</abbr> ℜ</h4>
<a href="http://news.tuxmachines.org/feed.xml">RSS 2.0</a>
<hr />
<a href="http://news.tuxmachines.org/irc.shtml" title="Enter IRC channels">Contact us</a> (<abbr title="Internet Relay Chat (IRC), an application layer protocol that facilitates communication in the form of text">IRC</abbr> chat)
<hr />
For <abbr title="Use end-to-end encryption to ensure only the recipient of messages and accompanying material can see the contents">privacy</abbr>: <a href="http://schestowitz.com/PGP/" title="PGP Key">encrypted/PGP</a>
<hr />
<h2>Latest Updates</h2>
<?php
// List the feed items whose title contains the word UPDATED.
// never index past the end of a short (or empty) feed
$limit = min(50, count($feed));
for ($x = 0; $x < $limit; $x++) {
    if (strpos($feed[$x]['title'], 'UPDATED') !== false) {
        $date = date('l F d, Y', strtotime($feed[$x]['date']));
        // nodeValue is decoded text, so escape it again before it goes
        // into an attribute value or element content
        $title = htmlspecialchars($feed[$x]['title'], ENT_QUOTES, 'UTF-8');
        $link = htmlspecialchars($feed[$x]['link'], ENT_QUOTES, 'UTF-8');
        // NOTE(review): echoed unescaped, presumably because the feed
        // carries HTML in its descriptions -- confirm
        $description = $feed[$x]['desc'];
        echo '<h4><strong>⚓ Post #'.$x.': <a href="'.$link.'" title="'.$title.'">'.$title.'</a></strong></h4>';
        echo '<p><small><em>Posted on '.$date.'</em></small></p>';
        echo '<p>'.$description.'</p>';
    }
}
?>
</td>
</tr>
</tbody>
</table>
<p align="right">
<a href="mobile">Alternative version</a> (more mobile-friendly, for narrow screens and less colour)
</p>
</body>
./mobile.php
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="generator" content=
"HTML Tidy for HTML5 for Linux version 5.6.0" />
<meta http-equiv="Content-Type" content=
"text/html; charset=utf-8" />
<meta name="viewport" content=
"width=device-width, initial-scale=1.0, maximum-scale=2.0, user-scalable=2" />
<title>Welcome to Tux Machines</title>
<link rel="alternate" type="application/rss+xml" href="http://news.tuxmachines.org/feed.xml" title="Tux Machines" />
<style type="text/css" media="screen">
/*<![CDATA[*/
hr {
overflow: visible; /* For IE */
padding: 0;
border: none;
border-top: medium double #333;
color: #333;
text-align: center;
}
hr:after {
content: "Tux Machines";
display: inline-block;
position: relative;
top: -0.7em;
font-variant: small-caps;
font-size: 0.5em;
padding: 0 0.25em;
background: white;
}
h1 {
font-size: 280%;
font-variant: small-caps;
text-shadow: #bbb 0.1em 0.1em 0.2em;
text-align: center;
}
h2 {
font-size: 125%;
font-variant: small-caps;
text-align: center;
text-shadow: #bbb 0.1em 0.1em 0.2em;
}
h3, h4 {
font-size: 110%;
text-shadow: #555 0.1em 0.1em 0.2em;
}
body {
float: none;
}
div div.common {
border: thick solid #eee;
background: #eee;
float: left;
}
div div.main {
border: thick solid #fff;
background: #fff;
float: left;
width: 66%;
}
div div.syndication {
border: thick solid #eee;
background: #eee;
float: left;
}
div div.main p.posted {
font-size: 80%;
font-style: italic;
padding-top: 0;
padding-bottom: 0;
margin-top: 0;
margin-bottom: 0;
}
div div.main p {
padding-left: 2em;
}
div.footer {
clear: left;
}
p.date {
font-size: 80%;
text-align: center;
}
img {
float: center;
padding: 0.3em 0.1em 0.3em 0.1em;
box-shadow: 0.4em 0.4em 0.4em #222;
border: medium solid #aaa;
border-radius: 2.5em;
margin-top: -0.5em;
margin-right: 1em;
margin-bottom: 1em;
max-width: 30%;
}
/*]]>*/
</style>
</head>
<body>
<div class="foo">
<div class="common">
<h2>Daily Picks</h2>
<ul>
<li><a href="http://news.tuxmachines.org/">The latest (updated daily)</a></li>
</ul>
<hr />
<h2>Common Destinations</h2>
<dl>
<dd><a href="http://www.tuxmachines.org/node">Older site</a></dd>
<dt>Drupal-based site, now archived</dt>
<dd><a href="http://www.tuxmachines.org/Search">Site search</a></dd>
<dt>Our 2004-2022 archives, nearly 170,000 pages indexed and catalogued</dt>
<dd><a href="http://www.tuxmachines.org/aggregator/sources">News aggregator</a></dd>
<dt>An antenna for GNU/Linux news, still active and up to date</dt>
<dd><a href="http://www.tuxmachines.org/blog">Blogs</a></dd>
<dt>Editorials and site news, now legacy as we've moved to our new CMS</dt>
<dd><a href="https://diaspora.psyco.fr/people/dfba90307f78013ad90d001e67d879df">Diaspora</a></dd>
<dt>JoinDiaspora has shut down, but we're still active in another pod</dt>
<dd><a href="https://mastodon.social/@tuxmachines">Mastodon</a></dd>
<dt>Follow our updates in the Fediverse</dt>
<dd><a href="https://twitter.com/tuxmachines">Twitter</a></dd>
<dt>Proprietary network, but still widely used</dt>
</dl>
</div>
<div class="main">
<h1><u>Tux Machines</u></h1>
<h2>Latest Images</h2>
<img src="http://news.tuxmachines.org/Features/latest" alt="Latest image" /> <img src="http://news.tuxmachines.org/Features/second-latest" alt="Second latest image" />
<h2>Latest Updates</h2>
<?php
// Read the site's own RSS feed into $feed (one array per <item> with the
// keys 'title', 'desc', 'link' and 'date'), then list the items whose
// title contains the word UPDATED. A later PHP section of this page
// re-uses $feed.
$rss = new DOMDocument();
$feed = array();
// load() returns false when the feed is unreachable or is not well-formed
// XML; in that case $feed stays empty and nothing is listed.
if (@$rss->load('http://news.tuxmachines.org/feed.xml')) {
    $fields = array('title' => 'title', 'desc' => 'description',
                    'link' => 'link', 'date' => 'pubDate');
    foreach ($rss->getElementsByTagName('item') as $node) {
        $item = array();
        foreach ($fields as $key => $tag) {
            // an <item> may lack a child element; fall back to ''
            $element = $node->getElementsByTagName($tag)->item(0);
            $item[$key] = $element ? $element->nodeValue : '';
        }
        $feed[] = $item;
    }
}
// never index past the end of a short (or empty) feed
$limit = min(50, count($feed));
for ($x = 0; $x < $limit; $x++) {
    if (strpos($feed[$x]['title'], 'UPDATED') !== false) {
        $date = date('l F d, Y', strtotime($feed[$x]['date']));
        // nodeValue is decoded text, so escape it again before it goes
        // into an attribute value or element content
        $title = htmlspecialchars($feed[$x]['title'], ENT_QUOTES, 'UTF-8');
        $link = htmlspecialchars($feed[$x]['link'], ENT_QUOTES, 'UTF-8');
        // NOTE(review): echoed unescaped, presumably because the feed
        // carries HTML in its descriptions -- confirm
        $description = $feed[$x]['desc'];
        echo '<h4><strong>⚓ Post #'.$x.': <a href="'.$link.'" title="'.$title.'">'.$title.'</a></strong></h4>';
        // the "posted" class is the one this page's stylesheet defines
        // for paragraphs inside div.main
        echo '<p class="posted"><small><em>Posted on '.$date.'</em></small></p>';
        echo '<p>'.$description.'</p>';
    }
}
?>
<hr />
<h2>This Week in History</h2>
</div>
<div class="syndication">
<h4><abbr title="Follow the site using XML-based indices">Syndication</abbr> ℜ</h4>
<a href="http://news.tuxmachines.org/feed.xml">RSS 2.0</a>
<h2>Latest Additions</h2>
<?php
// List the feed items whose title does NOT contain the word UPDATED.
// never index past the end of a short (or empty) feed
$limit = min(50, count($feed));
for ($x = 0; $x < $limit; $x++) {
    // strpos() returns 0 for a match at the very start of the title, so a
    // strict comparison is needed to tell that apart from "not found"
    if (strpos($feed[$x]['title'], 'UPDATED') === false) {
        $date = date('l F d, Y', strtotime($feed[$x]['date']));
        // nodeValue is decoded text, so escape it again before it goes
        // into an attribute value or element content
        $title = htmlspecialchars($feed[$x]['title'], ENT_QUOTES, 'UTF-8');
        $link = htmlspecialchars($feed[$x]['link'], ENT_QUOTES, 'UTF-8');
        // NOTE(review): echoed unescaped, presumably because the feed
        // carries HTML in its descriptions -- confirm
        $description = $feed[$x]['desc'];
        echo '<h4><strong>⚓ Post #'.$x.': <a href="'.$link.'" title="'.$title.'">'.$title.'</a></strong></h4>';
        echo '<p class="posted"><small><em>Posted on '.$date.'</em></small></p>';
        echo '<p>'.$description.'</p>';
    }
}
?>
</div>
</div>
<div class="footer">
<hr />
<a href="http://news.tuxmachines.org/irc.shtml" title="Enter IRC channels">Contact us</a> (<abbr title="Internet Relay Chat (IRC), an application layer protocol that facilitates communication in the form of text">IRC</abbr> chat)
<hr />
For <abbr title="Use end-to-end encryption to ensure only the recipient of messages and accompanying material can see the contents">privacy</abbr>: <a href="http://schestowitz.com/PGP/" title="PGP Key">encrypted/PGP</a>
<p class="date">
<?php
// prints the current date and time in RFC 2822 format,
// e.g. "Thu, 21 Dec 2000 16:01:07 +0200"
echo date(DATE_RFC2822);
?>
</p>
</div>
</body>
</html>
./get-latest-feature-image.sh
#!/bin/bash
#
# Get a pair of newly-added images
#
# Looks at the four most recently modified entries in this month's image
# directory, ignores any whose name contains "latest", and copies the
# newest and the oldest of what remains to Features/latest and
# Features/second-latest respectively.
#
# NOTE(review): "second-latest" therefore receives the oldest of the four
# newest entries, not the second newest -- confirm that this is intended.

features=/var/www/tuxmachines.org/htdocs/Features
month=$(date +%m)
year=$(date +%Y)

# Stop if this month's directory does not exist (for instance right after
# the month rolls over); otherwise the listing below would be taken from
# whatever directory the script happens to be in.
cd "/var/www/tuxmachines.org/htdocs/i/$year/$month/" || exit 1

# One listing serves both picks, so they cannot disagree if a file arrives
# in between.
candidates=$(ls -Art | tail -n 4 | grep -v latest)
newest=$(printf '%s\n' "$candidates" | tail -n1)
oldest=$(printf '%s\n' "$candidates" | head -n1)

# Only overwrite the published copies when there is a regular file to
# copy; redirecting an empty selection would truncate them to zero bytes.
if [ -n "$newest" ] && [ -f "$newest" ]; then
    cat -- "$newest" > "$features/latest"
fi

if [ -n "$oldest" ] && [ -f "$oldest" ]; then
    cat -- "$oldest" > "$features/second-latest"
fi

exit 0
./404.shtml
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Tux Machines - Client Error: 404 Not Found</title>
<meta name="dc.date.created" content="20220712" />
<meta name="dc.description" content="Do you waddle the waddle" />
<link rel="stylesheet" href="/CSS/tuxmachines.css"
media="screen" type="text/css" />
<style type="text/css" media="print">
article a {
font-weight: bolder;
text-decoration: none;
}
article a[href^=http]:after {
content:" <" attr(href) "> ";
}
</style>
<meta name="DC.Creator" content="Tux Machines" />
<link rel="alternate" title="Tux Machines RSS"
href="/feed.xml" type="application/rss+xml" />
</head>
<body>
<!--#include virtual="/header.html"-->
<!--#include virtual="/feeds.html"-->
<div class="error">
<!--#include virtual="/navigation.html"-->
<h1>404 Not Found</h1>
<p class="notfound">
<em><!--#echo var="REQUEST_URI" --></em>
</p>
<p>
The link you were looking for could not be found.
</p>
</div>
<h1 class="recent">Recent Tux Machines Posts</h1>
<!--#include virtual="/latest-news.html"-->
<!--#include virtual="/footer.html"-->
</body>
</html>
./tm-extract-summary.pl
#!/usr/bin/perl
# 2023-01-25
# fetches posts from the database and makes an HTML DL list based
# on author and title with the description, grouped by date
use utf8;
use Getopt::Long;
use Date::Calc qw/Today Add_Delta_YM Add_Delta_YMD/;
use DBI qw(:sql_types);
use HTML::TreeBuilder::XPath;
use HTML::Entities qw/encode_entities_numeric decode_entities/;
# use Data::Dumper qw/Dumper/;
use English;
use strict;
use warnings;
# SQLite3 database that is queried for the posts' metadata.
our $dbfile="/var/www/tuxmachines.org/db/tm-static-site-generator.sqlite3";

# Command-line options, keyed by their single-letter names.
our %opt;

# Verbosity level; every -v on the command line raises it by one.
our $VERBOSE = 0;

# Stop on unrecognised or malformed options instead of silently carrying
# on with the default starting date.
GetOptions ("date=s" => \$opt{'d'},
            "help" => \$opt{'h'},
            "verbose+" => \$opt{'v'},
    ) or do {
        print STDERR "Try '$0 --help' for usage.\n";
        exit(1);
    };

my $script = $0;

if (defined($opt{'h'})) {
    usage($script);
}

if (defined($opt{'v'})) {
    $VERBOSE = $opt{'v'};
}

my ($year, $month, $day) = get_date($opt{'d'});

# The summary always runs from a starting date onwards, so the flag is
# set unconditionally.
$opt{'s'} = 1;
print "Starting Date: $year/$month/$day\n" if ($VERBOSE);

extract_and_write($year,$month,$day);

exit(0);
# Print the command-line help to standard output and exit successfully.
#
#   $script   name under which the program was invoked
sub usage {
    my ($script) = (@_);

    print "USAGE:\n\n";
    print "$script [-hv] [-d date]\n\n";
    print " -d, --date date as YYYYMMDD, defaults to a month ago\n";
    print " -v, --verbose show debugging info\n";
    print " -h, --help show this message\n";
    print "\n";
    print "Summarize posts by title and author, grouped by date, since ";
    print "the designated date. If no date is given, then start from ";
    print "one month ago.\n";
    print "\n";
    exit(0);
}
# Validate and return the date given as an option, or fall back to the
# current date (GMT) minus one month plus one day.
#
#   $d   date as YYYY-MM-DD or YYYYMMDD; any false value selects the
#        default
#
# Returns the list (year, month, day) as zero-padded strings. Prints a
# message to STDERR and exits with status 1 when $d is malformed or is
# not a valid calendar date.
sub get_date {
    my ($d) = (@_);

    if ($d) {
        # accept either the dashed or the compact form, nothing in between
        my @parts = ($d =~ m/^([0-9]{4})-([0-9]{2})-([0-9]{2})$/);
        if (!@parts) {
            @parts = ($d =~ m/^([0-9]{4})([0-9]{2})([0-9]{2})$/);
        }

        if (!@parts) {
            print STDERR qq(Invalid date '$d'\n);
            exit(1);
        }

        # check_date is not among the functions this file imports from
        # Date::Calc, so it has to be called by its full name
        if (! Date::Calc::check_date(@parts)) {
            print STDERR qq(Invalid date '$d', );
            print STDERR qq(use YYYY-MM-DD or YYYYMMDD\n);
            exit(1);
        }

        return(@parts);
    }

    # if no date was provided, start from a month ago
    my ($year, $month, $day) = Today(1);    # get date GMT
    ($year,$month,$day) = Add_Delta_YM($year,$month,$day,0,-1);
    ($year,$month,$day) = Add_Delta_YMD($year,$month,$day,0,0,1);

    $year = sprintf("%04d", $year);
    $month = sprintf("%02d", $month);
    $day = sprintf("%02d", $day);

    return($year, $month, $day);
}
# Build the summary for everything since the given date, wrap it in the
# page template and print the finished page on standard output.
#
#   $year, $month, $day   starting date
sub extract_and_write {
    my ($year, $month, $day) = @_;

    # fall back to a short notice when the extraction yields nothing
    my $summary = extract($year, $month, $day)
        || qq(<p>No records since $year-$month-$day</p>\n);

    print new_xhtml_document($year, $month, $day, $summary);
}
# Get the relevant records from the SQLite3 database and render them as
# an HTML definition list: one <dt> per date, followed by a <dd> holding
# a nested list with a linked title (<dt>) and "author : description"
# (<dd>) for each post of that date.
#
#   $year, $month, $day   starting date; joined as YYYY-MM-DD for the query
#
# Returns the list as an HTML string, or a short "No records processed."
# paragraph when the query matched nothing.
sub extract {
    my ($year,$month,$day) = (@_);

    # RaiseError already makes DBI die on failure; the explicit check is
    # kept as a fallback and reports DBI's own error text.
    my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile", undef, undef,
                           { AutoCommit => 0, RaiseError => 1 })
        or die("Could not open database '$dbfile': $DBI::errstr\n");

    # fetch relevant records, starting with specified date
    my $sth = query("$year-$month-$day", $dbh);

    # process found records into a sortable hash, keyed by record number
    my $count = 0;
    my %record = ();
    while (my $data = $sth->fetchrow_hashref) {
        my $recno = $data->{'recno'};
        my $date = substr($data->{'ts'},0,10);
        my $ballast = $data->{'ballast'};
        my $slug = $data->{'slug'};

        # 2023-01-25 becomes 2023/01/25; exactly one slash separates the
        # directory from the file name, with or without ballast
        my $path = $date;
        $path =~ s{^([0-9]{4})-([0-9]{2})-([0-9]{2})}{$1/$2/$3};
        my $file = $ballast ? "$slug.$ballast.shtml" : "$slug.shtml";

        $record{$recno} = {
            'date'        => $date,
            'timestamp'   => $data->{'ts'},
            'author'      => $data->{'author'},
            'title'       => $data->{'title'},
            'description' => $data->{'description'},
            'href'        => "/n/$path/$file",
        };

        # number of rows processed
        $count++;
    }
    $sth->finish;
    $dbh->disconnect;

    # nothing found: no point in building an empty list
    if (!$count) {
        if ($VERBOSE) {
            print "No records processed.\n\n";
        }
        return(" <p>No records processed.</p>\n");
    }

    my $oldDate;                                 # undef until the first record
    my $daySummary = HTML::Element->new('dl');   # entries of the working day
    my $summary = HTML::Element->new('dl');      # grand list of days

    # sort hash of processed records and build HTML definition list(s)
    for my $rec (sort {$record{$a}->{'date'} cmp $record{$b}->{'date'}
                       or $record{$a}->{'author'} cmp $record{$b}->{'author'}
                       or $record{$a}->{'timestamp'} cmp $record{$b}->{'timestamp'}
                       or $a cmp $b } keys %record) {
        my $author = $record{$rec}->{'author'};
        my $title = $record{$rec}->{'title'};
        my $description = $record{$rec}->{'description'};
        my $date = $record{$rec}->{'date'};
        my $timestamp = $record{$rec}->{'timestamp'};
        my $href = $record{$rec}->{'href'};

        if ($VERBOSE) {
            print "$rec: $date, $timestamp: $author\n";
            print "\t$href\n";
        }

        # beginning of new day
        if (!defined($oldDate) || $oldDate ne $date) {
            # Close the previous day, if there was one. Skipping this for
            # the very first record avoids an empty <dd> ahead of the
            # first date.
            if (defined($oldDate)) {
                my $ddSummary = HTML::Element->new('dd');
                $ddSummary->push_content($daySummary);
                $summary->push_content($ddSummary);
                $daySummary = HTML::Element->new('dl');
            }

            # add a definition list title for the next date
            my $dt = HTML::Element->new('dt');
            $dt->push_content($date);
            $summary->push_content($dt);

            # remember working date
            $oldDate = $date;
        }

        # build entry hyperlink to article
        my $anchor = HTML::Element->new('a', 'href'=>$href);
        $anchor->push_content($title);

        my $dt = HTML::Element->new('dt');     # entry hyperlink + title
        my $dd1 = HTML::Element->new('dd');    # entry author + description
        $dt->push_content($anchor);
        $dd1->push_content($author." : ".$description);

        # add link+title, author+description to list for working date
        $daySummary->push_content($dt);
        $daySummary->push_content($dd1);
    }

    # harvest the last day's entries
    my $ddSummary = HTML::Element->new('dd');
    $ddSummary->push_content($daySummary);
    $summary->push_content($ddSummary);

    # convert to HTML, indented by one space per level, with a closing
    # tag for every element
    my $summaryhtml = $summary->as_HTML( '', ' ', {} );
    $summary->delete;

    return($summaryhtml);
}
# Actually query the SQLite3 database.
#
#   $date   earliest dc.date.modified value to include; bound to the
#           query's single placeholder
#   $dbh    open DBI database handle
#
# Returns the executed statement handle, ready for fetching. Each row
# carries recno, ts, author, title, description, ballast and slug.
# Dies when preparing or executing the statement fails.
sub query {
    my ($date, $dbh) = (@_);

    # ts = full datetime stamp
    # find date modified, author, title, description, and file name parts
    my $query = qq(
        SELECT recno, ts, author, title, description, ballast, slug
        FROM (
            SELECT recno, value AS ts
            FROM metadata
            WHERE term='dc.date.modified'
            AND value>=?) AS T1
        JOIN (
            SELECT recno, value AS author
            FROM metadata
            WHERE term='dc.creator') AS T2
        USING(recno)
        JOIN (
            SELECT recno, value AS title
            FROM metadata
            WHERE term='dc.title') AS T3
        USING(recno)
        JOIN (
            SELECT recno, value AS description
            FROM metadata
            WHERE term='dc.description') AS T4
        USING(recno)
        JOIN (
            SELECT recno, ballast, slug FROM keys ) AS T5
        USING(recno)
        ORDER BY SUBSTR(ts,1,10), author, ts desc;
    );

    # A method call is not interpolated inside a double-quoted string, so
    # the error text has to be concatenated.
    my $sth = $dbh->prepare($query)
        or die "prepare statement failed: " . $dbh->errstr() . "\n";

    $sth->execute($date)
        or die "execute statement failed: " . $dbh->errstr() . "\n";

    if ($VERBOSE > 1) {
        print "Main Query= $query\n";
    }

    return($sth);
}
# Fill in a template to create an HTML page.
#
#   $year, $month, $day   starting date, shown in the title and heading
#   $summary              ready-made HTML placed in the page body
#
# Returns the complete page as a string. The server-side include
# directives are passed through untouched for the web server to expand.
sub new_xhtml_document {
    my ($year,$month,$day,$summary) = (@_);

    # /feed.xml is the site's RSS feed, so it is advertised with the RSS
    # media type, in line with the other pages that link to it.
    my $html = <<"EOHTML";
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Tux Machines posts since $year-$month-$day</title>
<link rel="stylesheet" href="/CSS/tuxmachines.css"
media="screen" type="text/css" />
<link rel="alternate" type="application/rss+xml" href="/feed.xml"
title="Tux Machines" />
<link rel="shortcut icon" href="/Images/whitejazz_favicon_0.ico"
type="image/x-icon" /></head>
<body>
<!--#include virtual="/header.html"-->
<!--#include virtual="/feeds.html"-->
<div class="monthly">
<!--#include virtual="/navigation.html"-->
<h1>Tux Machines posts since $year-$month-$day</h1>
$summary
</div>
<!--#include virtual="/footer.html"-->
</body>
</html>
EOHTML

    return($html);
}
Last updated Wed Sep 20 15:00:01 UTC 2023