User:Philosobot/Source code/phillists/update philosophers.pl

#!/usr/bin/perl
use strict;		      # 'strict' insists that all variables be declared
use diagnostics;	      # 'diagnostics' expands the cryptic warnings
use open 'utf8';

use lib $ENV{HOME} . '/public_html/wp/modules'; # path to perl modules

require 'bin/perlwikipedia_utils.pl';
require "strip_accents_and_stuff.pl";
require "bin/fetch_articles.pl";
require "read_from_write_to_disk.pl";
require "bin/get_last.pl";
require 'lists_utils.pl';
undef $/; # undefines the separator. Can read one whole file in one scalar.

MAIN:{

my $Editor=wikipedia_login();

my (@names, $dir, $file, $text, $birth, $death, $country, %country2nationality, $last, $name, $name_stripped);
my ($philosopher_prefix, $countries_file, $all_philosophers, $philosophers_logfile, $todays_log, $philosopher_cat_list);
my ($list_of_categories, @philosophy_categories, @philosopher_categories, @other_categories, @articles_from_cats, @new_categories);
my (%entries, $line, @lines, %people, $letter, $articles_from_cats_file, $sleep, $attempts,  %blacklist, $ndash, $edit_summary);
my  @letters=("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z");


# files to be used
$countries_file='Countries.txt';
$philosopher_prefix='Index of philosophers'; # will add later (A), (B), ...
$articles_from_cats_file='All_philosophers_from_cats.txt';
$all_philosophers='All_philosophers.txt';
$philosophers_logfile='Philosophers_log.txt';
$philosopher_cat_list = "New_philosopher_categories.txt";

$list_of_categories='List_of_philosophy_categories.wiki';
# Get today's articles found in categories
&read_categories_from_list(\@philosophy_categories,\@philosopher_categories,\@other_categories, $list_of_categories);
&fetch_articles(\@philosopher_categories, \@articles_from_cats, \@new_categories);
open (FILE, ">", $philosopher_cat_list); print FILE join ("\n", @new_categories); close(FILE);

&put_redirects_on_blacklist (\%blacklist, $articles_from_cats_file, \@articles_from_cats);
&put_redlinks_on_blackphil($philosopher_prefix, \@letters, \%blackphil);

@articles_from_cats=&randomize_array(@articles_from_cats); # this has a purpose, to identify entries differning only by capitals

&read_countries ($countries_file, \%country2nationality);
&parse_new(\@articles_from_cats, \%country2nationality, \%entries);

# now, deal with the existing entries in the index of philosophers
$sleep = 5; $attempts=100; $text="";
foreach $letter (@letters){
$text = $text . "\n" . wikipedia_fetch($Editor, "$philosopher_prefix ($letter).wiki", $attempts, $sleep);
}
$text =~ s/\[\[Category:.*?\]\]//g; # rm any categories, those are not philosophers

# combine the data from the new index of philosophers (%entries) and the existing index ($text)
@lines = split("\n", $text);
foreach $line (@lines){

next unless ($line =~ /^\s*\*\s*\[\[(.*?)\]\]/);
$name=$1;

# Upcase. Something more robust will need to be put in here.
$name =~ s/^(.)/uc($1)/eg;

# add last, first if not there yet
if ($name !~ /\|/ ) {
$last = &get_last( $name ); # for a given name, try to guess the first name and the last name
$line =~ /^\s*\*\s*\[\[(.*?)\]\](.*?)$/;
$line = "* \[\[$name\|$last\]\]$2";
}

$name =~ s/\|.*?$//g; # strip pipe
next if (exists $blacklist{$name}); # ignore blacklist, just as above

# reconcile the new $entries{$name} with the old $line
if (exists $entries{$name} && $entries{$name} ne $line) {
print "Reconciling $name\n";
$entries{$name}=&reconcile ($line, $entries{$name});
}else{
$entries{$name}=$line;
}
}

# split into a number of hashes, by letter. Those hashes are keyed by an ascii version of the last nime, for sorting.
foreach $name (keys %entries ){

# strip the last name of accents and prefixes to be able to sort by it.
next unless ($entries{$name} =~ /^\s*\*\s*\[\[(.*?)\]\]/);
$last=$1; $last = &strip_last($last);

next unless ($last =~ /^([a-z])/i);
$letter=uc($1);
$people{$letter}->{$last}=$entries{$name};
}

foreach $letter (@letters){
&split_into_sections ($people{$letter});

$text = "__NOTOC__\n\{\{PhilTopicTOC\}\}\n";
foreach $last ( sort {$a cmp $b} keys %{$people{$letter}} ) {
#    $people{$letter}->{$last} =~ s/(\*\s*\[\[.*?\]\].*?\(.*?,)(.*?)$/&put_ndash($1, $2)/eg; # put ndash back
$text = $text . "$people{$letter}->{$last}\n";
}
$text = $text . "\n[[Category:Philosophy-related lists]]";

# These are groups of people, not individual philosophers
$text =~ s/(Bourbaki.*?\(.*?),.*?\n/$1\)\n/g;
$text =~ s/(Martians.*?\(.*?),.*?\n/$1\)\n/g;
$text =~ s/(Blanche Descartes.*?\(.*?),.*?\n/$1\)\n/g;

$edit_summary='Daily update. See [[User:Philosobot/Changes to phillists]] for changes.';
wikipedia_submit($Editor, "$philosopher_prefix ($letter).wiki", $edit_summary, $text, $attempts, $sleep);
}

# create log and write to disk. Later will integrate with philosophy articles log and submit
$todays_log=&process_log_of_todays_changes(\%entries, \%blacklist, $all_philosophers);
open(FILE, ">$philosophers_logfile");  print FILE "$todays_log";  close(FILE);

}

sub read_countries {
my (@countries, $countries_file, $country2nationality);

($countries_file, $country2nationality)=@_;

open (FILE, "<", $countries_file); # map from nationality to country
@countries=split("\n", <FILE>);
close(FILE);

foreach (@countries){
next unless (/^(.*?)\s*-\s*(.*?)\s*$/);
$country2nationality->{lc($1)}=$2;
}
$country2nationality->{"\?"}="\?"; # unknown country
}

sub parse_new {
my ($name, $text, $country, $birth, $death, $last);
my ($articles_from_cats, $country2nationality, $entries)=@_;

my $Editor=wikipedia_login();

# go through the articles, read them in and get necessary data.
foreach $name (@$articles_from_cats){

next if ($name =~ /philosophers?$/i); # this is not a person, rather a list, or term
next if ($name =~ /^List of/i); # this is not a person, rather a term
next if ($name =~ /^Contributors/i); # this is not a person, rather a list


$text = &read_from_disk_or_wikipedia($Editor, $name);

# get DOB, country, etc
($country, $birth, $death) = &parse_get_data($text, $country2nationality);
$last = &get_last( $name ); # for a given name, try to guess the first name and the last name
$entries->{$name}="* \[\[$name|$last\]\] \($country, $birth - $death\)"; # put in a hash

}
}

sub strip_last {
my $last=shift;

if ($last =~ /\|(.*?)$/){
$last=$1;
}

# this is needed to sort things well
$last =~ s/^[a-z]+ //g; # rm word not starting with upper case (like "de Vito", sort by V and not d)
$last =~ s/^[a-z]+ //g; # this will work for van der Waerden
$last =~ s/^[a-z]+ //g; # one more time just in case
$last =~ s/^Le //g; # for some French philosophers
$last =~ s/^Al[\- ]//ig; # for some French philosophers

$last = &strip_accents_and_stuff($last); # and strip accents

return $last;
}

sub put_ndash {

my $a=shift; my $b=shift;
$b =~ s/-/\x{2013}/g;
return "$a$b";
}


sub reconcile {
my ($old, $oname, $ocountry, $obirth, $odeath, $new, $nname, $ncountry, $nbirth, $ndeath);
($old, $new)=@_;

# from new, take the country, birth, death info
if ($new =~ /^\s*\*\s*\[\[(.*?)\]\]\s*\((.*?),(.*?)-(.*?)\)/){
$nname=$1; $ncountry=$2; $nbirth=$3; $ndeath=$4;
$ncountry =~ s/\s*$//g; $ncountry =~ s/^\s*//g;
$nbirth =~ s/\s*$//g; $nbirth =~ s/^\s*//g;
$ndeath =~ s/\s*$//g; $ndeath =~ s/^\s*//g;
}else{
$nname = "";
$ncountry = "?"; $nbirth = "?"; $ndeath = "";
}

# old country
$ocountry="?"; $obirth="?"; $odeath="";
if ($old =~ /^\s*\*\s*\[\[(.*?)\]\].*?\((.*?),(.*?)-(.*?)\)/){ $odeath=$4;   }
if ($old =~ /^\s*\*\s*\[\[(.*?)\]\].*?\((.*?),(.*?)[-\)]/   ){ $obirth=$3;   }
if ($old =~ /^\s*\*\s*\[\[(.*?)\]\].*?\((.*?)[\),\d]/       ){ $ocountry=$2; }
if ($old =~ /^\s*\*\s*\[\[(.*?)\]\]/                        ){ $oname=$1;    }

# strip space
$ocountry =~ s/\s*$//g;   $ocountry =~ s/^\s*//g;
$obirth   =~ s/\s*$//g;   $obirth   =~ s/^\s*//g;
$odeath   =~ s/\s*$//g;   $odeath   =~ s/^\s*//g;

# Always keep the the name from the old. For the rest, keep the entry with most info.
$nname    = $oname     if (                      $oname    =~ /\w/    );
$ncountry = $ocountry  if ($ncountry  !~ /\w/ && $ocountry =~ /\w/    );
$nbirth   = $obirth    if ($nbirth    !~ /\d/ && $obirth   =~ /[^\?]/ );
$ndeath   = $odeath    if ($ndeath    !~ /\d/ && $odeath   =~ /[^\?]/ );
return "* [[$nname]] \($ncountry, $nbirth - $ndeath\)";	
}

sub parse_get_data {

my ($country, $birth, $death, @countries, %duplication_tracker, $text, $country2nationality);

$text=$_[0]; $country2nationality=$_[1];

$text =~ s/\[\[Category\s*:\s*Ancient\s+philosophers[^\]\[]*?\]\]//ig;
$text =~ s/\[\[Category\s*:[^\]]*?century[^\]\[]*?\]\]//ig;
$text =~ s/\[\[Category\s*:[^\]]*?women[^\]\[]*?\]\]//ig;
$text =~ s/\[\[Category\s*:\s*Philosophers[^\]\[]*?\]\]//ig;

@countries=();
@countries = (@countries, ($text =~ /\[\[Category:\s*([a-z ]*?)\s+philosophers/ig));

$country="";
foreach (@countries){
$_ = lc ($_);
if ( exists $country2nationality->{$_}){
$_=$country2nationality->{$_};
}

if  ( ! exists $duplication_tracker{$_} && $_ !~ /^\s*$/) {
$country = "$country" . "$_" . "/";
$duplication_tracker{$_}=1;
}
}
$country =~ s/^\s*\/*\s*//;
$country =~ s/\s*\/*\s*$//;
$country = "\?" if ($country =~ /^\s*$/);

$birth="\?"; # a person must have a date of birth
if ($text =~ /\[\[Category:\s*([^\]\[]*?)\s+births/i){
$birth=$1;
}

$death=""; # have some respect, the person might still be alive
if ($text =~ /\[\[Category:\s*([^\]\[]*?)\s+deaths/i){
$death=$1;
}

if ($text =~ /\{\{lived\s*\|\s*b=(.*?)\s*\|\s*d=(.*?)\s*[\}\|]/i) {
$birth=$1; $death=$2;
}elsif  ($text =~ /\{\{lived\s*\|\s*b=(.*?)\s*[\|\}]/i){
$birth=$1;
}

return ($country, $birth, $death);
}