measure.pl
# This work is licensed under the Creative Commons
# Attribution-ShareAlike 3.0 Unported License. To view a copy of this
# license, visit http://creativecommons.org/licenses/by-sa/3.0/ or send
# a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042,
# USA.
# Reference: http://www.lombardinetworks.net
# (C) Robert Tolksdorf, http://www.robert-tolksdorf.de
#use warnings;
use Quantum::Superpositions;
use Switch;
use Data::Dumper;
use Graph::Directed;
use Getopt::Std;
#use Graph::Reader::XML;
require "/vhome/lombardinetworks.net/htdocs/bin/helper.pl";
sub normalize_year {
# is it two-digits only?
if (($_[0]<100) && ($_[0]>=0)) {
# is it 2000+?
if ($_[0]<10) {
return $_[0]+2000;
} else {
return $_[0]+1900;
}
} else {
# already normalized
return $_[0];
}
}
# get options, -v emits the content of a .var, if applicable
my %options=();
getopts("v", \%options);
&loadWorkIds();
&load_aliases();
switch ($ARGV[0]) {
#---------------------------------------------
case "NodesEdges" {
print "Work, Nodes, Edges, Edges per Node\n";
# Iterate all xgmml files
foreach $key (keys %workid) {
$line=&readNetworkAsLine($workid{$key});
$nodes= () = $line =~ /<node/g;
$edges= () = $line =~ /<edge/g;
print "\"$key\", $nodes, $edges, ".$edges/$nodes."\n";
}
}
#---------------------------------------------
case "NodeTypes" {
print "Work, Nodes, Institution, MergeInstitution, Person, Year, YearFinal, FinalInfo, Person per Institution\n";
# Iterate all xgmml files
foreach $key (keys %workid) {
$line=&read_network_as_line($workid{$key});
$nodes= () = $line =~ /<node/g;
$edges= () = $line =~ /<edge/g;
$institution = () = $line =~ /lombardi.owl#Institution/g;
$mergedinstitution = () = $line =~ /lombardi.owl#MergedInstitution/g;
$person = () = $line =~ /lombardi.owl#Person/g;
$year = () = $line =~ /lombardi.owl#Year/g;
$yearfinal = () = $line =~ /lombardi.owl#YearFinal/g;
$finalinfo = () = $line =~ /lombardi.owl#FinalInfo/g;
print "\"$key\", $nodes, $institution, $mergedinstitution, $person, $year, $yearfinal, $finalinfo, ".$person/$institution."\n";
}
}
#---------------------------------------------
case "EdgeTypes" {
print "Work, Edges, InfluenceControl, Association, FinancialTransaction, BlockedFailedTransaction, SaleProperty, SaleTransfer, ".
"FinancialAssociation, Final, BlockedFailed, Connection, YearArrow, FinancialConnection, SingleNearby\n";
# Iterate all xgmml files
foreach $key (keys %workid) {
$line=&read_network_as_line($workid{$key});
$nodes= () = $line =~ /<node/g;
$edges= () = $line =~ /<edge/g;
$influencecontrol = () = $line =~ /lombardi.owl#InfluenceControl/g;
$association = () = $line =~ /lombardi.owl#Association/g;
$financialtransaction = () = $line =~ /lombardi.owl#FinancialTransaction/g;
$blockedfailedtransaction = () = $line =~ /lombardi.owl#BlockedFailedTransaction/g;
$saleproperty = () = $line =~ /lombardi.owl#SaleProperty/g;
$saletransfer = () = $line =~ /lombardi.owl#SaleTransfer/g;
$financialassociation = () = $line =~ /lombardi.owl#FinancialAssociation/g;
$final = () = $line =~ /lombardi.owl#Final/g;
$blockedfailed = () = $line =~ /lombardi.owl#BlockedFailed/g;
$connection = () = $line =~ /lombardi.owl#Connection/g;
$yeararrow = () = $line =~ /lombardi.owl#YearArrow/g;
$yearline = () = $line =~ /lombardi.owl#YearLine/g;
$financialconnection = () = $line =~ /lombardi.owl#FinancialConnection/g;
$singlenearby = () = $line =~ /lombardi.owl#SingleNearby/g;
print "\"$key\", $edges, $influencecontrol, $association, $financialtransaction, $blockedfailedtransaction, ".
"$saleproperty, $saletransfer, $financialassociation, $final, $blockedfailed,".
"$connection,$yeararrow, $yearline, $financialconnection, $singlenearby\n";
}
}
#---------------------------------------------
case "GraphStats" {
print ("Work; No. Nodes; No. Edges; Av. Degree; Radius\n");
# Iterate all xgmml files
my @keys = sort { $a cmp $b } keys %workid;
foreach my $key ( @keys ) {
$g=&read_network_as_graph($workid{$key});
$ug=$g->undirected_copy_graph;
if ($ug->is_connected) {
$radius_text=$ug->radius;
} else {
$radius_text="--";
}
printf "$key; %u; %u; %.2f; $radius_text\n",scalar $g->vertices, scalar $g->edges, $ug->average_degree, $radius_text ; # $g-diameter;
}
}
#---------------------------------------------
case "Blankness" {
print ("Work; No. Nodes; No. blank Nodes; Percentage\n");
# Iterate all xgmml files
my @keys = sort { $a cmp $b } keys %workid;
foreach my $key ( @keys ) {
$g=&read_network_as_graph($workid{$key});
$line=&read_network_as_line($workid{$key});
my $blank_nodes = () = $line =~ /\[\d*\]/g; # Count [2] Put matches into a list and then take the scalar
$blank_nodes += () = $line =~ /\(\d*\)/g; # Count (2) Put matches into a list and then take the scalar
printf "$key; %u; %u; %.3f\n",scalar $g->vertices, $blank_nodes, $blank_nodes/scalar $g->vertices;
}
}
#---------------------------------------------
case "NodeIntersect" {
# Iterate all xgmml files
foreach $key (keys %workid) {
my @labelList = ();
open(F, "<", &file_of_network($workid{$key}));
while ($line=<F>) {
# do we read a node?
if ($line=~/^\s*\<node id=\".*\"/) {
$line2="";
$tolist=0;
# as long as we do not read the closing node tag...
while (!($line2=~/^\s*\<\/node/)) {
# get line
$line2=<F>;
# if we get the name attribute
if ($line2=~/\s*<att name=\"name\" value=\"(.*)\" type/) {
$label=$1;
# ignore [x], (x), ? ...
if (&to_consider($key)) {
# decode if necessary
$label=~s/\&/\&/g;
$label=~s/\"/\"/g;
# apply alias
if (exists($alias{$label})) {
$label=$alias{$label};
}
# append the name we found
push @labelList, $label;
}
}
}
}
}
close(F);
# get reference to array of node labels for network
# store as {workid} reference to [labels]
$labels{$workid{$key}}=\@labelList; # store a reference to the array
}
# Iterate though all networks
for (my $i=1000; $i<1000+ keys %labels; $i++) {
# Iterate through all following
for (my $j=$i+1; $j<1000+ keys %labels; $j++) {
# %count elements will be increased for each label from both networks. If ==2 > overlap
my %count = ();
$overlap=0;
# get refs to label arrays
my $a=$labels{$i}; # $a is the ref!
my $b=$labels{$j};
# Iterate through both and increase count for label
foreach (@{$a}) {
$count{$_}++;
}
foreach (@{$b}) {
$count{$_}++;
}
# look at result if count for label=1, only in one network, if 2 in both -> overlap
foreach $e (keys %count) {
if ($count{$e}==2) {
$overlap++;
}
}
# $overlap now is number of labels that appear in both
# output some statistics
if ($overlap>0) {
# print "Networks $i and $j overlap in $overlap nodes";
# print "They are '".$idwork{$i}."' and '".$idwork{$j}."'\n";
# printf (" (%.3f%% of nodes in $i, %.3f%% of nodes in $j\n",100*$overlap/scalar @{$a},100*$overlap/scalar @{$b});
# percentage of all nodes in $i that overlap with nodes in $j
# if =100% -> $i is a subgraph of $j
# if [$i][$j high and [$j][$i] is low -> $i contributes to $j, but other also do
# if [$i][$j high and [$j][$i] is high -> $i is a predecessor of $j
$overlap_matrix[$i][$j]=$overlap/scalar @{$a};
# percentage of all nodes in $j that overlap with nodes in $i
$overlap_matrix[$j][$i]=$overlap/scalar @{$b};
} else {
$overlap_matrix[$i][$j]=0;
$overlap_matrix[$j][$i]=0;
}
$overlapTotal+=$overlap;
}
}
if (defined $options{v}) {
print Dumper (\@overlap_matrix);
} else {
print "$overlapTotal overlaps in total\n";
# Iterate though all networks
for (my $i=1000; $i<1000+ keys %labels; $i++) {
print ";".$idwork{$i}; # this leaves the header for the first column empty
}
print "\n";
# Iterate though all networks
for (my $i=1000; $i<1000+ keys %labels; $i++) {
print $idwork{$i};
for (my $j=1000; $j<1000+ keys %labels; $j++) {
printf(";%.3f",$overlap_matrix[$i][$j]);
}
print "\n";
}
# emit one long vector for excels histogram
# for (my $i=1000; $i<1000+ keys %labels; $i++) {
# # Iterate through all following
# for (my $j=$i+1; $j<1000+ keys %labels; $j++) {
# printf("%.3f\n",$overlap_matrix[$i][$j]);
# }
# }
}
}
#---------------------------------------------
case "YearRange" {
# Iterate all xgmml files
foreach $id (sort values %workid) {
open(F, "<", &file_of_network($id));
# init with extremes
$max_year=-1;
$min_year=9999;
while ($line=<F>) {
# if it starts a node then we extract a new year
if ($line=~/<node/) {
# so far, no year in this node
$year_found=0;
}
# if it is an attribute that contains the Year-URI
if ($line=~/\s*<att .*\"http:\/\/www.lombardinetworks.net\/lombardi.owl\#Year\"/) {
# we got a year node
$year_found=1;
}
if ($line=~/<att name=\"name\" value=\"(.*?)\"/) {
# store the value, even if it later turns out as not a year
$year=$1;
}
if ($line=~/<\/node/) {
if ($year_found) {
# change nodes like 82/1 (would fail for to_consider otherwise
if ($year=~/(.*)\//) {
$year=$1;
}
# if not empty (TODO: why?) not blank, process it
if (($year ne "") && (&to_consider($year))) {
# change things like 1986
if ($year=~/19(\d\d)/) {
$year=$1;
}
# change things like April 85
if ($year=~/\D*(\d\d)\D*/) {
$year=$1;
}
# do we have a new max?
if ($year>$max_year) {
$max_year=$year;
}
# do we have a new min?
if ($year<$min_year) {
$min_year=$year;
}
}
}
}
}
close(F);
# now also look at the title of the work
$max_title_year=-1;
$min_title_year=9999;
if ($idwork{$id} =~ /.*(\d{2,4})-(\d{2,4}).*/) {
$min_title_year=&normalize_year($1);
$max_title_year=&normalize_year($2);
} else {
# deal with the pattern`'1981-6'
if ($idwork{$id} =~ /.*(\d{4})-(\d{1}).*/) {
$min_title_year=$1; # alread four digits
$max_title_year=substr($1,0,3).$2; # alread four digits
} else { # not sure whether there are work with eg. 84-8 in the title, but to make sure... UNTESTED!
if ($idwork{$id} =~ /.*(\d{2})-(\d{1}).*/) {
$min_title_year=&normalize_year($1);
$max_title_year=&normalize_year(substr($1,0,1).$2);
}
}
}
if ($min_year!=9999) {
$min_year=&normalize_year($min_year);
}
if ($max_year!=-1) {
$max_year=&normalize_year($max_year);
}
# take which one is earlier/later
if ($min_title_year<$min_year) {
$min_year=$min_title_year;
}
if ($max_title_year>$max_year) {
$max_year=$max_title_year;
}
# Generate a string "min-max"
if ($min_year != 9999) {
$year_range{$id}= $min_year."-";
}
if ($max_year!=-1) {
# add "-" if no min_year
$year_range{$id}.=$min_year != 9999 ?"":"-";
$year_range{$id}.=$max_year;
}
}
print Dumper(\%year_range);
}
else {
print "measure NodeEdges|NoteTypes|EdgeTypes|GraphStats|NodeIntersect|Blankness|YearRange\n";
}
}