#!/usr/bin/perl -w ############################################################### # Network Topology calculation # # # # Haiyuan Yu && Xiaowei zhu # # Input: 1. Dataset # # 2. Group data vector # # Output: One table including the average links, diameter # # and clustering coefficient for each group # # Format: ./topnet.txt vector inter_dataset # ############################################################### use strict; my $final_report="./net_report.txt"; my $FINAL; (open(FINAL, ">$final_report")) || die "cannot open the final file"; my @result; # the data about the average links my $groupnum; # the number of groups in the submitted file my (@diameter,@cc); my $i=0; print "start\n"; # main process #======================= #1.average links ($groupnum, @result)=&find_links(); # get the number of groups # $result[i][0]=group name # $result[i][1]=ave links # $result[i][2]=standard error # $result[i][3]=numbre of elements print "1 ok\n"; # prepare for the clustering coffiecient calculation: make a table of cc for each orf #2.table_cc &table_cc(); print "2 ok\n"; #3.clustering coeffiecient @cc=&find_cc(); # get the clustering coeffiecient for each group # $cc[i][0]=group name # $cc[i][1]=cluster coeffiecient # $cc[i][2]=standard error print "3 ok\n"; # prepare for the calculation fo diameter: make a table of distances between every orf pair. #4.table_diameter &table_diameter(); print "4 ok\n"; #5.diameter @diameter=&find_diameter(); # get the diameter and its standard error for each group # $diameter[i][0]=group name # $diameter[i][1]=diameter(max) # $diameter[i][2]=standard error # $diameter[i][3]=average distances print "5 ok\n"; # print final_report #=================================== print FINAL "Group Name\tNumber of Elements\tAverage Links\t Standard error\tDiameter\tAverage distance\tStandard error\t Clustering Coeffiecient\tStandard error\n"; for($i=0; $i<$groupnum; $i++) { print FINAL "$result[$i][0]\t$result[$i][3]\t$result[$i][1]\t $result[$i][2]\t$diameter[$i][1]\t$diameter[$i][3]\t$diameter[$i][2]\t $cc[$i][1]\t$cc[$i][2]\n"; } print "All done!\tThe report file is './net_report.txt'\n"; exit; #======================= sub find_links { # To calculate the average number of links for different groups of proteins within cell; my $vector_file=$ARGV[0]; my $dataset_file=$ARGV[1]; my ($DATA, $INPUT, $OUT); my @out; my ($aa, $bb, $cc); my $group; my @temp; my $gcount=0; my $N; my ($std,$tt); my $total; my (%NLinks, %prots, %num); my ($out_x, $out_y); my @regress; my ($mdata, $bdata, $corr); # Read in interaction information open (DATA, "<$dataset_file") || die "Unable to open the input file"; my %read; my $lines = 0; while () { chomp; ($aa, $bb) = split; $aa =~ s/\-//g; $aa =~ tr/[a-z]/[A-Z]/; $bb =~ s/\-//g; $bb =~ tr/[a-z]/[A-Z]/; if (!exists $read{$aa}{$bb}) { $NLinks{$aa} = 0 if (!exists $NLinks{$aa}); $NLinks{$bb} = 0 if (!exists $NLinks{$bb}); $NLinks{$aa} ++; $NLinks{$bb} ++; $read{$aa}{$bb} = $read{$bb}{$aa} = 1; $lines ++ ; } } close (DATA); # Read input data ; open (INPUT, "<$vector_file") || die "Unable to open the input file"; while () { chomp; ($group, @temp) = split (/\t/); $group =~ s/\s+/\-/g; $group =~ s/\//%/g; %prots = (); for $aa (@temp) { $aa =~ s/\-//g; $aa =~ tr/[a-z]/[A-Z]/; $prots{$aa} = 1; } # Calculate average number of links; $total = 0; $N = 0; %num = (); for $aa (keys %prots) { if (exists $NLinks{$aa}) { $total += $NLinks{$aa}; $N ++; $num{$NLinks{$aa}} = 0 if (!exists $num{$NLinks{$aa}}); $num{$NLinks{$aa}} ++; } } $out[$gcount][0] = $group; # the name of this group $out[$gcount][1] = $total / $N; # the average links of this group $out[$gcount][3] = $N; # the number of elements in this group # calculate the standard error $std = 0; for $aa (keys %prots) { if (exists $NLinks{$aa}) { $tt = $NLinks{$aa} - $out[$gcount][1]; $std += ($tt * $tt); } } $std = $std / $N; $std = sqrt ($std); # standard deviation $std/= sqrt($N); # standard error $out[$gcount][2]=$std; # the standard error of ave_links $gcount++; } close (INPUT); return ($gcount, @out); } #find_diameter # To calculate diameters and averaged distances for different groups of proteins within cell; sub find_diameter { my $vector_file=$ARGV[0]; my $dataset_file=$ARGV[1]; my ($INPUT, $DATA); my $group; my @temp; my (%total, %N); my ($aa,$bb); my $dist; my $tt; my (%vector,%ave, %max, %std); my $i; my @report; my $gnum=0; my $std; my @groupname; # Read input data ; open (INPUT, "<./$vector_file") || die "Unable to open the input file"; while () { chomp; ($group, @temp) = split (/\t/); $group =~ s/\s+/\-/g; $groupname[$i++]=$group; $total{$group} = $N{$group} = $max{$group} = 0; for $i (0..$#temp) { $aa = $temp[$i]; $aa =~ s/\-//g; $aa =~ tr/[a-z]/[A-Z]/; $vector{$group}{$aa} = 1; } } $gnum=$i; close (INPUT); # Read in interaction information open (DATA, "./Net_Distance_all.txt") || die "Unable to open the input file"; while () { chomp; ($aa, $bb, $dist) = split; $aa =~ s/\-//g; $aa =~ tr/[a-z]/[A-Z]/; $bb =~ s/\-//g; $bb =~ tr/[a-z]/[A-Z]/; for $group (keys %vector) { if ((exists $vector{$group}{$aa}) && (exists $vector{$group}{$bb})) { $total{$group} += $dist; $N{$group} ++; $max{$group} = $dist if ($max{$group} < $dist); last; } } } close (DATA); # Calculate the standard error; open (DATA, "<./Net_Distance_all.txt") || die "Unable to open the input file"; while () { chomp; ($aa, $bb, $dist) = split; $aa =~ s/\-//g; $aa =~ tr/[a-z]/[A-Z]/; $bb =~ s/\-//g; $bb =~ tr/[a-z]/[A-Z]/; for $group (keys %vector) { if ((exists $vector{$group}{$aa}) && (exists $vector{$group}{$bb})) { $ave{$group} = $total{$group} / $N{$group} if (!exists $ave{$group}); $tt = $ave{$group} - $dist; $std{$group} += ($tt * $tt); last; } } } close (DATA); for($i=0; $i<$gnum; $i++) { $group = $groupname[$i]; if ($N{$group} != 0) { $std = sqrt($std{$group} / ($N{$group} * $N{$group})); } else { $std=-1; } $report[$i][0] = $group; $report[$i][1] = $max{$group}; $report[$i][2] = $std; $report[$i][3] = $ave{$group}; } return (@report); } # To calculate the average CC for different groups of proteins within cell; # Read in interaction information #====================================================== sub find_cc { my $vector_file=$ARGV[0]; my $dataset_file=$ARGV[1]; my @report; my @temp; my %cc; my $total; my $N; my $ave; my $aa; my %prots; my $group; my $DATA; my $i=0; my $INPUT; my $std; my $se; my $tt; open (DATA, "./Net_Output_CC.txt") || die "Unable to open the input file"; while () { chomp; @temp = split; $temp[0] =~ s/\-//g; $temp[0] =~ tr/[a-z]/[A-Z]/; $cc{$temp[0]} = $temp[2]; } close (DATA); # Read input data ; open (INPUT,"<./$vector_file") || die "Unable to open the input file"; while () { chomp; ($group, @temp) = split (/\t/); $group =~ s/\s+/\-/g; %prots = (); for $aa (@temp) { $aa =~ s/\-//g; $aa =~ tr/[a-z]/[A-Z]/; $prots{$aa} = 1; } # Calculate average number of links; $total = 0; $N = 0; for $aa (keys %prots) { if (exists $cc{$aa}) { $total += $cc{$aa}; $N ++; } } if ($N != 0) { $ave = $total / $N; $report[$i][0]=$group; $report[$i][1]=$ave; # Calculate the standard diviation; $std = 0; for $aa (keys %prots) { if (exists $cc{$aa}) { $tt = $cc{$aa} - $ave; $std += ($tt * $tt); } } $std = $std / $N; $se = sqrt ($std/$N); $report[$i++][2]=$se; } } return (@report); close (INPUT); } sub table_cc { # Calculate the clustering coefficient of each individual protein within the interaction networks; # Suppose a vertex has k neighbours; then at most k(k-1)/2 edges can exist between them; C denote the fraction of these allowable edges that actually exist; # CC for the whole networks is the average CC for each individual vertex; # Haiyuan Yu # 04/12/2003 my $inter_file = $ARGV[1]; my %read; my %NLinks; my ($aa, $bb, $cc); my ($DATA, $OUT); my $PE; my $N; my %inter; my %prots; my $total; my $CC; my $Num; my $ave; # Read in interaction information open (DATA, "$inter_file") || die "Unable to open the input file"; %read = (); while () { chomp; ($aa, $bb) = split; $aa =~ s/\-//g; $aa =~ tr/[a-z]/[A-Z]/; $bb =~ s/\-//g; $bb =~ tr/[a-z]/[A-Z]/; if (!exists $inter{$aa}{$bb}) { $NLinks{$aa} = 0 if (!exists $NLinks{$aa}); $NLinks{$bb} = 0 if (!exists $NLinks{$bb}); $NLinks{$aa} ++; $NLinks{$bb} ++; $inter{$aa}{$bb} = $inter{$bb}{$aa} = 1; $prots{$aa} = $prots{$bb} = 1; } } close (DATA); # Calculate the CC; open (OUT, ">./Net_Output_CC.txt"); for $aa (sort keys %prots) { if ($NLinks{$aa} > 1) { $PE = $NLinks{$aa} * ($NLinks{$aa} - 1); # $PE: Possible Edges; %read = (); $N = 0; for $bb (sort keys %{$inter{$aa}}) { if ($bb ne $aa) { $read{$bb} = 1; for $cc (sort keys %{$inter{$aa}}) { if (($cc ne $bb) && ($cc ne $aa) && (!exists $read{$cc})) { $N ++ if (exists $inter{$bb}{$cc}); } } } } $CC = $N / $PE; if ($CC > 1) { print "Something wrong with $aa\t$NLinks{$aa}\t$CC\t$N\n"; exit; } print OUT "$aa\t$NLinks{$aa}\t$CC\t$N\n"; $total += $CC; $Num ++; } } close (OUT); $ave = $total / $Num; } sub table_diameter { # To calculate the distance between protein pairs within interaction networks; # Haiyuan Yu # 03/27/2003 my $dir = "results"; # Where to store the results; my $infile=$ARGV[1]; my ($aa, $bb, $cc); my ($DATA, $ALL); my %prots; my (@level, @nlevel); my $i; my %overlap; my $out; my %Inter; my %read; # Read in interaction information open (DATA, "<$infile") || die "Unable to open the input file"; while () { chomp; ($aa, $bb) = split; $aa =~ s/\-//g; $aa =~ tr/[a-z]/[A-Z]/; $bb =~ s/\-//g; $bb =~ tr/[a-z]/[A-Z]/; $prots{$aa} = 1; $prots{$bb} = 1; $Inter{$aa}{$bb} = $Inter{$bb}{$aa} = 1; } close (DATA); # Calculate the distance between protein pairs using greedy algorithm; open (ALL, ">Net_Distance_all.txt"); for $aa (sort keys %prots) { %overlap = (); $overlap{$aa} = 1; $read{$aa} = 1; $i = 0; @level = (); $level[0] = $aa; while ($#level >= 0) { $i ++; @nlevel = (); for $bb (@level) { for $cc (sort keys %{$Inter{$bb}}) { if (!exists $overlap{$cc}) { push @nlevel, $cc; print ALL "$aa\t$cc\t$i\n" if (!exists $read{$cc}); } $overlap{$cc} = 1; } } @level = (); push @level, @nlevel; } } close (ALL); }