measure.pl


# This work is licensed under the Creative Commons
# Attribution-ShareAlike 3.0 Unported License. To view a copy of this
# license, visit http://creativecommons.org/licenses/by-sa/3.0/ or send
# a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042,
# USA. 
# Reference: http://www.lombardinetworks.net
# (C) Robert Tolksdorf, http://www.robert-tolksdorf.de
#use warnings;
use Quantum::Superpositions;
use Switch;
use Data::Dumper;
use Graph::Directed;
use Getopt::Std;
#use Graph::Reader::XML;

require "/vhome/lombardinetworks.net/htdocs/bin/helper.pl";

sub normalize_year {
    # is it two-digits only?
    if (($_[0]<100) && ($_[0]>=0)) {
	# is it 2000+?
	if ($_[0]<10) {
	    return $_[0]+2000;
	} else {
	    return $_[0]+1900;
	}
    } else {
	# already normalized
	return $_[0];
    }
}

# get options, -v emits the content of a .var, if applicable
my %options=();
getopts("v", \%options);

&loadWorkIds();
&load_aliases();

switch ($ARGV[0]) {
#---------------------------------------------
    case "NodesEdges" {
	print "Work, Nodes, Edges, Edges per Node\n";
# Iterate all xgmml files
	foreach $key (keys %workid) {
	    $line=&readNetworkAsLine($workid{$key});
	    $nodes= () = $line =~ /<node/g;
	    $edges= () = $line =~ /<edge/g;
	    print "\"$key\", $nodes, $edges, ".$edges/$nodes."\n";
	}
    }
#---------------------------------------------
    case "NodeTypes" {
	print "Work, Nodes, Institution, MergeInstitution, Person, Year, YearFinal, FinalInfo, Person per Institution\n";
 # Iterate all xgmml files
	foreach $key (keys %workid) {
	    $line=&read_network_as_line($workid{$key});
	    $nodes= () = $line =~ /<node/g;
	    $edges= () = $line =~ /<edge/g;
	    $institution = () = $line =~ /lombardi.owl#Institution/g;
	    $mergedinstitution = () = $line =~ /lombardi.owl#MergedInstitution/g;
	    $person = () = $line =~ /lombardi.owl#Person/g;
	    $year = () = $line =~ /lombardi.owl#Year/g;
	    $yearfinal = () = $line =~ /lombardi.owl#YearFinal/g;
	    $finalinfo = () = $line =~ /lombardi.owl#FinalInfo/g;
	    print "\"$key\", $nodes, $institution, $mergedinstitution, $person, $year, $yearfinal, $finalinfo, ".$person/$institution."\n";
	}
    }
#---------------------------------------------
    case "EdgeTypes" {
	print "Work, Edges, InfluenceControl, Association, FinancialTransaction, BlockedFailedTransaction, SaleProperty, SaleTransfer, ".
	    "FinancialAssociation, Final, BlockedFailed, Connection, YearArrow, FinancialConnection, SingleNearby\n";
        # Iterate all xgmml files
	foreach $key (keys %workid) {
	    $line=&read_network_as_line($workid{$key});
	    $nodes= () = $line =~ /<node/g;
	    $edges= () = $line =~ /<edge/g;
	    $influencecontrol = () = $line =~ /lombardi.owl#InfluenceControl/g;
	    $association = () = $line =~ /lombardi.owl#Association/g;
	    $financialtransaction = () = $line =~ /lombardi.owl#FinancialTransaction/g;
	    $blockedfailedtransaction = () = $line =~ /lombardi.owl#BlockedFailedTransaction/g;
	    $saleproperty = () = $line =~ /lombardi.owl#SaleProperty/g;
	    $saletransfer = () = $line =~ /lombardi.owl#SaleTransfer/g;
	    $financialassociation = () = $line =~ /lombardi.owl#FinancialAssociation/g;
	    $final = () = $line =~ /lombardi.owl#Final/g;
	    $blockedfailed = () = $line =~ /lombardi.owl#BlockedFailed/g;
	    $connection = () = $line =~ /lombardi.owl#Connection/g;
	    $yeararrow = () = $line =~ /lombardi.owl#YearArrow/g;
	    $yearline = () = $line =~ /lombardi.owl#YearLine/g;
	    $financialconnection = () = $line =~ /lombardi.owl#FinancialConnection/g;
	    $singlenearby = () = $line =~ /lombardi.owl#SingleNearby/g;

	    print "\"$key\", $edges, $influencecontrol, $association, $financialtransaction, $blockedfailedtransaction, ".
		"$saleproperty, $saletransfer, $financialassociation, $final, $blockedfailed,".
		"$connection,$yeararrow, $yearline, $financialconnection, $singlenearby\n";
	}
    }
#---------------------------------------------
    case "GraphStats" {
	print ("Work; No. Nodes; No. Edges; Av. Degree; Radius\n");
	# Iterate all xgmml files
	my @keys = sort { $a cmp $b } keys %workid;
	foreach my $key ( @keys ) {
	    $g=&read_network_as_graph($workid{$key});
	    $ug=$g->undirected_copy_graph;
	    if ($ug->is_connected) {
		$radius_text=$ug->radius;
	    } else {
		$radius_text="--";
	    }
	    printf "$key; %u; %u; %.2f; $radius_text\n",scalar $g->vertices, scalar $g->edges, $ug->average_degree, $radius_text ; # $g-diameter;


	}
    }
#---------------------------------------------
    case "Blankness" {
	print ("Work; No. Nodes; No. blank Nodes; Percentage\n");
	# Iterate all xgmml files
	my @keys = sort { $a cmp $b } keys %workid;
	foreach my $key ( @keys ) {
	    $g=&read_network_as_graph($workid{$key});
	    $line=&read_network_as_line($workid{$key});
	    my $blank_nodes = () = $line =~ /\[\d*\]/g;	    # Count [2] Put matches into a list and then take the scalar
	    $blank_nodes += () = $line =~ /\(\d*\)/g;	    # Count (2) Put matches into a list and then take the scalar
	    printf "$key; %u; %u; %.3f\n",scalar $g->vertices, $blank_nodes, $blank_nodes/scalar $g->vertices;
	}
    }
#---------------------------------------------
    case "NodeIntersect" {
        # Iterate all xgmml files
	foreach $key (keys %workid) {
	    my @labelList = ();
	    open(F, "<", &file_of_network($workid{$key}));
	    while ($line=<F>) {
		# do we read a node?
		if ($line=~/^\s*\<node id=\".*\"/) {
		    $line2="";
		    $tolist=0;
		    # as long as we do not read the closing node tag...
		    while (!($line2=~/^\s*\<\/node/)) {
			# get line
			$line2=<F>;
			# if we get the name attribute
			if ($line2=~/\s*<att name=\"name\" value=\"(.*)\" type/) {
			    $label=$1;
			    # ignore [x], (x), ? ...
			    if (&to_consider($key)) {
				# decode if necessary
				$label=~s/\&amp;/\&/g;
				$label=~s/\&quot;/\"/g;
				# apply alias
				if (exists($alias{$label})) {
				    $label=$alias{$label};
				}
				# append the name we found
				push @labelList, $label;
			    }
			}
		    }
		}
	    }
	    close(F);
	    # get reference to array of node labels for network
	    # store as {workid} reference to [labels]
	    $labels{$workid{$key}}=\@labelList; # store a reference to the array
	}
	# Iterate though all networks
	for (my $i=1000; $i<1000+ keys %labels; $i++) {
	    # Iterate through all following
	    for (my $j=$i+1; $j<1000+ keys %labels; $j++) {
		# %count elements will be increased for each label from both networks. If ==2 > overlap
		my %count = ();
		$overlap=0;
		# get refs to label arrays
		my $a=$labels{$i}; # $a is the ref!
		my $b=$labels{$j};
		# Iterate through both and increase count for label
		foreach (@{$a}) {
		    $count{$_}++;
		}
		foreach (@{$b}) {
		    $count{$_}++;
		}
		# look at result if count for label=1, only in one network, if 2 in both -> overlap
		foreach $e (keys %count) {
		    if ($count{$e}==2) {
			$overlap++;
			
		    }
		}
		# $overlap now is number of labels that appear in both
		# output some statistics
		if ($overlap>0) {
#		    print "Networks $i and $j overlap in $overlap nodes";
#		    print "They are '".$idwork{$i}."' and '".$idwork{$j}."'\n";
#		    printf (" (%.3f%% of nodes in $i, %.3f%% of nodes in $j\n",100*$overlap/scalar @{$a},100*$overlap/scalar @{$b});
		    # percentage of all nodes in $i that overlap with nodes in $j
		    # if =100% -> $i is a subgraph of $j
		    # if [$i][$j high and [$j][$i] is low -> $i contributes to $j, but other also do
		    # if [$i][$j high and [$j][$i] is high -> $i is a predecessor of $j
		    $overlap_matrix[$i][$j]=$overlap/scalar @{$a};
		    # percentage of all nodes in $j that overlap with nodes in $i
		    $overlap_matrix[$j][$i]=$overlap/scalar @{$b};
		} else {
		    $overlap_matrix[$i][$j]=0;
		    $overlap_matrix[$j][$i]=0;
		}	    
		
		$overlapTotal+=$overlap;
	    }
	}
	if (defined $options{v}) {
	    print Dumper (\@overlap_matrix);
	} else {
	    print "$overlapTotal overlaps in total\n";
	    # Iterate though all networks
	    for (my $i=1000; $i<1000+ keys %labels; $i++) {
		print ";".$idwork{$i}; # this leaves the header for the first column empty
	    }
	    print "\n";
	    # Iterate though all networks
	    for (my $i=1000; $i<1000+ keys %labels; $i++) {
		print $idwork{$i};
		for (my $j=1000; $j<1000+ keys %labels; $j++) {
		    printf(";%.3f",$overlap_matrix[$i][$j]);
		}
		print "\n";
	    }
	    # emit one long vector for excels histogram
	    # for (my $i=1000; $i<1000+ keys %labels; $i++) {
	    #     # Iterate through all following
	    #     for (my $j=$i+1; $j<1000+ keys %labels; $j++) {
	    # 	printf("%.3f\n",$overlap_matrix[$i][$j]);
	    #     }
	    # }
	}
    }
#---------------------------------------------
    case "YearRange" {
        # Iterate all xgmml files
	foreach $id (sort values %workid) {
	    open(F, "<", &file_of_network($id));
	    # init with extremes
	    $max_year=-1;
	    $min_year=9999;
	    while ($line=<F>) {
		# if it starts a node then we extract a new year
		if ($line=~/<node/) {
		    # so far, no year in this node
		    $year_found=0;
		}
		# if it is an attribute that contains the Year-URI
		if ($line=~/\s*<att .*\"http:\/\/www.lombardinetworks.net\/lombardi.owl\#Year\"/) {
		    # we got a year node
		    $year_found=1;
		}
		if ($line=~/<att name=\"name\" value=\"(.*?)\"/) {
		    # store the value, even if it later turns out as not a year
		    $year=$1;
		}
		if ($line=~/<\/node/) {
		    if ($year_found) {
			# change nodes like 82/1 (would fail for to_consider otherwise
			if ($year=~/(.*)\//) {
			    $year=$1;
			}
			# if not empty (TODO: why?) not blank, process it
			if (($year ne "") && (&to_consider($year))) {
			    # change things like 1986
			    if ($year=~/19(\d\d)/) {
				$year=$1;
			    }
			    # change things like April 85
			    if ($year=~/\D*(\d\d)\D*/) {
				$year=$1;
			    }			    
			    # do we have a new max?
			    if ($year>$max_year) {
				$max_year=$year;
			    }
			    # do we have a new min?
			    if ($year<$min_year) {
				$min_year=$year;
			    }
			}
		    }
		}
	    }
	    close(F);
	    # now also look at the title of the work
	    $max_title_year=-1;
	    $min_title_year=9999;
	    if ($idwork{$id} =~ /.*(\d{2,4})-(\d{2,4}).*/) {
		$min_title_year=&normalize_year($1);
		$max_title_year=&normalize_year($2);
	    } else {
		# deal with the pattern`'1981-6'
		if ($idwork{$id} =~ /.*(\d{4})-(\d{1}).*/) {
		    $min_title_year=$1; # alread four digits
		    $max_title_year=substr($1,0,3).$2; # alread four digits
		} else { # not sure whether there are work with eg. 84-8 in the title, but to make sure... UNTESTED!
		    if ($idwork{$id} =~ /.*(\d{2})-(\d{1}).*/) {
			$min_title_year=&normalize_year($1);
			$max_title_year=&normalize_year(substr($1,0,1).$2);
		    }
		}
	    }
	    if ($min_year!=9999) {
		$min_year=&normalize_year($min_year);
	    }
	    if ($max_year!=-1) {
		$max_year=&normalize_year($max_year);
	    }
	    # take which one is earlier/later
	    if ($min_title_year<$min_year) {
		$min_year=$min_title_year;
	    }
	    if ($max_title_year>$max_year) {
		$max_year=$max_title_year;
	    }
	    # Generate a string "min-max"
	    if ($min_year != 9999) {
		$year_range{$id}= $min_year."-";
	    }
	    if ($max_year!=-1) {
		# add "-" if no min_year
		$year_range{$id}.=$min_year != 9999 ?"":"-";
		$year_range{$id}.=$max_year;
	    }
	}
	
	print Dumper(\%year_range);

    }
    else {
	print "measure NodeEdges|NoteTypes|EdgeTypes|GraphStats|NodeIntersect|Blankness|YearRange\n";
    }
}