Merge pull request #97 from netsage-project/new-disciplines
New disciplines
lisaens authored Oct 23, 2019
2 parents ace3c7b + 3117c0c commit aa0157d
Showing 15 changed files with 1,024 additions and 1,060 deletions.
86 changes: 60 additions & 26 deletions bin/resourcedb-export.pl
@@ -12,35 +12,44 @@
use Data::Dumper;
use Encode;

# This script will pull data out of the Science Registry database and write it to a .json file (used by resourcedb-make-mmdb.pl)
# AND to a .yaml file (for use with logstash translate filter)
# This script will pull data out of the Science Registry database and write it to a
# .json file (used by resourcedb-make-mmdb.pl, which creates the fake geoip mmdb file used for logstash SciReg tagging)
# AND to a .yaml file (used with logstash translate filter at one point; maybe still useful someday)
# AND to a .csv file (to parse or send to a human; delimiter is | )
# RUNS FROM CRON
# ~10/23/19 - Filter out resources with discipline = "Unknown" or "non-science" as we don't want to tag flows with those disciplines.

# Defaults
my $help;
# Use same config file as resourcedb (Science Registry)
my $config_file = "/etc/grnoc/netsage/resourcedb/config.xml";
# Name output file with the current timestamp
my $output_file = "/etc/grnoc/netsage/resourcedb/datadump_".time().".json";
my $output_file = "/etc/grnoc/netsage/resourcedb/scireg_".time().".json";

#-----------------------------
sub usage() {
print " USAGE: perl resourcedb-export.pl [-c <config file>] [-o <output file>] [-h]
Without parameters, the defaults are
config_file = /etc/grnoc/netsage/resourcedb/config.xml
output_file = /etc/grnoc/netsage/resourcedb/datadump_<timestamp>.yaml (.json file will have same name. Must run as sudo) \n";
print " USAGE: perl resourcedb-export.pl [-c/--config <config file>] [-o/==output <output-dir/filename.json> [-h/--help]
Without parameters, the defaults are
config_file = /etc/grnoc/netsage/resourcedb/config.xml
    output = /etc/grnoc/netsage/resourcedb/scireg_<timestamp>.json
             (.yaml and .csv files with the same name are written too. Must run as sudo) \n";
exit;
}
#-----------------------------
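# Illustrative invocation (a sketch, not part of this commit; the output path is just an example):
#   sudo perl resourcedb-export.pl -c /etc/grnoc/netsage/resourcedb/config.xml -o /tmp/scireg_export.json
# The matching /tmp/scireg_export.yaml and /tmp/scireg_export.csv would be written alongside the .json.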

# defaults can be overridden on command line (-c and -o)
GetOptions( 'config|c=s' => \$config_file,
            'output|o=s' => \$output_file,
            'help|h|?' => \$help
);

# did they ask for help?
usage() if $help;

if ( $output_file !~ /\.json/) {
print "your output filename has to end in .json\n";
die;
}

# Read config file to get db connection info
if (! -f $config_file) {
@@ -64,23 +73,34 @@ ()
my $host = $config->get( '/config/database-host' );
my $port = $config->get( '/config/database-port' );

# csv file - columns wanted, in order. Values are db columns/hash keys
my @csv_columns = ("ip_block_id", "addresses_str", "org_name", "resource", "description", "country_code", "discipline", "role");
my $csv_header = "resource_id|IPs|org|resource|description|country_code|discipline|role \n";
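# Illustrative row (hypothetical values, not from the database), matching the header above
# and the '|' delimiter:
#   42|10.10.0.0/16,10.20.30.0/24|Example University|Example HPC Cluster|Campus compute cluster|US|Physics|Compute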

# Try to open output files
#json file
$output_file =~ s/yaml$/json/; # just in case
# json file
my $fh;
if (! open($fh, '>', $output_file) ) {
print "Could not open json output file $output_file\n";
die;
}
# yaml file
my $output_file_yaml = $output_file;
$output_file_yaml =~ s/json$/yaml/;
my $fh_yaml;
if (! open($fh_yaml, '>', $output_file_yaml) ) {
print "Could not open yaml output file $output_file_yaml\n";
die;
}

# csv file
my $output_file_csv = $output_file;
$output_file_csv =~ s/json$/csv/;
my $fh_csv;
if (! open($fh_csv, '>', $output_file_csv) ) {
print "Could not open csv output file $output_file_csv\n";
die;
}
print $fh_csv $csv_header;

# Connect to db
my $dbq = GRNOC::DatabaseQuery->new(
@@ -95,10 +115,11 @@ ()
if(!$conn_res){
die ("Error connecting to mysql.");
}
# tells dbq to expect/use unicode which is what's in the db
$dbq->{'dbh'}->do("SET NAMES utf8mb4;");

# Get info about resources
# FILTER OUT resources with discipline = "Unknown" or "non-science" AS WE DON'T WANT TO TAG FLOWS WITH THOSE DISCIPLINES.
my $resources = $dbq->select(
table => 'ip_block JOIN organization ON ip_block.organization_id = organization.organization_id '.
'JOIN discipline ON ip_block.discipline_id = discipline.discipline_id '.
@@ -123,7 +144,8 @@ ()
'organization.latitude as org_latitude',
'organization.longitude as org_longitude',
'organization.country_code as org_country_code'
]
],
where => { 'discipline.name' => [ -and => {'!=','Unknown'}, {'!=','non-science'} ] }
);
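# For reference, a sketch of roughly the SQL the select() above generates for the new filter
# (the real statement is built by GRNOC::DatabaseQuery):
#   ... WHERE discipline.name != 'Unknown' AND discipline.name != 'non-science'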

if (!$resources) {
@@ -154,13 +176,21 @@ ()
foreach my $proj (@$projects) {
push(@{$res->{'projects'}}, $proj);
}
### print Dumper($res); ###

# For json file
push(@all_resources, $res);

# For csv file
my $line = "";
foreach my $col (@csv_columns) {
$line = $line . $res->{$col} . '|';
}
$line =~ s/\n/ /g;
$line = $line."\n";

# For yaml file
# strip /xx's from addresses and expand any ip blocks
my @ip_array = split(",", $res->{'addresses_str'});
my @final_ips;
foreach my $ip (@ip_array) {
@@ -175,7 +205,7 @@ ()
push(@final_ips, $ipblock->addr());
} elsif ($slash < 28 or ($slash >32 and $slash < 124)) {
# if there are too many ip's in the block, write a regular expression that matches ip's in the block
# NetAddr::IP -> re() - Returns a Perl regular expression that will match an IP address within the given subnet.
# Defaults to ipV4 notation. Will return an ipV6 regex if the address in not in ipV4 space.
push(@final_ips, $ipblock->re());
} else {
@@ -188,20 +218,25 @@ ()
}
}
# remove dups and join array elements with |
@final_ips = uniq(@final_ips);
my $ip_regex = join( "|", @final_ips );
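# Illustrative sketch of the two cases above (not code from this commit; hostenum() is one
# possible way to enumerate a small block):
#   my $small = NetAddr::IP->new('10.1.2.0/30');
#   my @each  = map { $_->addr() } $small->hostenum();   # a few individual addresses
#   my $big   = NetAddr::IP->new('10.0.0.0/16');
#   my $re    = $big->re();    # one regex matching any address inside the /16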

# convert resource info into a json string, then convert that (which is utf8 from the db) to perl characters
# with internal utf-8 tags, using decode.
my $res_json = decode( 'utf-8', encode_json($res) );
# html-encode simple single quotes since we'll use them to start and end the string that holds the json and escaping them
# doesn't work ("s are already escaped)
$res_json =~ s/'/&apos;/g;

# write ip:'data' line to file (perl knows how to write utf8 chars)
print $fh_yaml "'".$ip_regex."' : '".$res_json."'\n" ;
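# An example of what one line looks like (hypothetical values):
#   '198.51.100.7|198.51.100.8' : '{"org_name":"Example University","discipline":"Physics", ... }'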

# csv file line
print $fh_csv $line;
}

close($fh_yaml);
close($fh_csv);

# Add an array of IPs in addition to the addresses string (for the old pipeline scireg tagger)
foreach my $resrc (@all_resources) {
@@ -214,5 +249,4 @@ ()
print $fh $json ;
close($fh);

print "Wrote $output_file and $output_file_yaml \n";

print "Wrote $output_file and $output_file_yaml and $output_file_csv \n";
4 changes: 4 additions & 0 deletions bin/resourcedb-init-db
@@ -1,5 +1,9 @@
#!/usr/bin/perl

# This is a script to run when installing Science Registry for the first time.
# It will create database users for the scripts to use and write /etc/grnoc/resourcedb/config.xml
# RUN MANUALLY with sudo

use strict;
use warnings;

1 change: 1 addition & 0 deletions bin/resourcedb-make-mmdb.pl
@@ -6,6 +6,7 @@
# Each db entry is for an individual cidr address. They are sorted so the longest prefixes (most specific addresses) come last, since
# the logstash GEOIP FILTER gets the LAST MATCH for an IP address.
# If successful, this script writes a timestamp to /var/lib/grnoc/scienceregistry-mmdb-file/status.txt.
# RUNS VIA CRON

# see https://blog.maxmind.com/2015/09/29/building-your-own-mmdb-database-for-fun-and-profit/
# also https://stackoverflow.com/questions/47655730/maxmind-writer-to-create-custom-database-to-use-with-geoip-in-elk-stack
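# A minimal sketch (an assumption, not this script's actual code) of the ordering described above:
# sort NetAddr::IP entries by prefix length so the most specific blocks come last and win the
# logstash geoip "last match":
#   my @sorted = sort { $a->masklen() <=> $b->masklen() } @cidr_blocks;   # @cidr_blocks is hypothetical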
8 changes: 7 additions & 1 deletion bin/resourcedb-update-db
@@ -1,5 +1,9 @@
#!/usr/bin/perl

# When first installing the Science Registry (resourcedb), and after every upgrade,
# run this script to set up the database tables and/or make any changes needed.
# RUN MANUALLY

use strict;
use warnings;

@@ -11,6 +15,7 @@ use DBI;
use constant DEFAULT_CONFIG_FILE => '/etc/grnoc/netsage/resourcedb/config.xml';

sub main {

my $config = GRNOC::Config->new(
config_file => DEFAULT_CONFIG_FILE,
force_array => 0
@@ -30,11 +35,12 @@ sub main {

my ($version, $err) = $db_util->install_database();
if (defined $err) {
warn "$err";
print "error: $err \n";
return;
}

return 1;
}

main();
print " DONE \n";
2 changes: 1 addition & 1 deletion lib/GRNOC/NetSage/ResourceDB.pm
@@ -13,7 +13,7 @@ package GRNOC::NetSage::ResourceDB;
use strict;
use warnings;

our $VERSION = '0.11.0';
our $VERSION = '0.12.0';

sub new {
