Merge pull request #147 from griidc/release/5.11.0

Release/5.11.0
griidc · Jan 29, 2019 · e64650e · e64650e
2 parents 7173d46 + 2cf4676
commit e64650e
Show file tree

Hide file tree

Showing 25 changed files with 868 additions and 29 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -53,7 +53,6 @@ before-install:
 # Install composer dependencies,
 # Create schema and fixtures
 install:
-  - pear install pear/PHP_CodeSniffer-2.5.1
   - export SYMFONY_ENV=drupal_dev
   - composer install
   - phpenv rehash
@@ -65,7 +64,9 @@ before-script:
 # Run script
 script:
   - phpunit
-  - phpcs --standard=GRIIDC --colors --ignore=bak --extensions=php -n -s ./src
+  - git clone --depth=1 https://github.com/squizlabs/PHP_CodeSniffer.git
+  - ./PHP_CodeSniffer/bin/phpcs --version
+  - ./PHP_CodeSniffer/bin/phpcs --config-set installed_paths ./src/GRIIDC/ --colors --ignore=bak --extensions=php -i -n -s ./src/
 
 notifications:
   email: false

diff --git a/README.md b/README.md
@@ -2,26 +2,22 @@
 
 Pelagos is a system for maintaining a repository of scientific research data.
 Developed and maintained by the Gulf of Mexico Research Initiative Information and Data Cooperative (GRIIDC).
-[URL] (https://data.gulfresearchinitiative.org/)
+URL: https://data.gulfresearchinitiative.org/
 ## Getting Started
 
 These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system.
 
 ### Prerequisites
 
-* [CENTOS 7](https://wiki.centos.org/)
-* [PHP 5.6](http://php.net/docs.php )
+* [CENTOS 6/7](https://wiki.centos.org/)
+* [PHP 7.1](http://php.net/docs.php)
 * [Symfony 3.4](https://symfony.com/doc/3.4/index.html) - PHP framework for web applications
 * [PostgreSQL 9.6](https://www.postgresql.org/docs/9.6/static/release-9-6.html) - ORDBMS
 * [FOSElasticaBundle](https://github.com/FriendsOfSymfony/FOSElasticaBundle) - PHP integration for Symfony with ElasticSearch
 * [RabbitMQ](https://www.rabbitmq.com/documentation.html) - Open source message broker software
 
 ### Installation
 
-## Documentation
-
-## Roadmap
-
 ## Contributors
 
 * **Michael Van Den Eijnden**  - (2012 - present) [Github](https://github.com/mickel1138)

diff --git a/share/bash/create-manifest.sh b/share/bash/create-manifest.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+#
+#  Creates zip file with manifest files.
+#
+
+if [ "$1" == "" ]
+    then
+        echo "No argument provided!"
+    fi
+
+for arg in "$@"
+do
+    if [ "$arg" == "--help" ] || [ "$arg" == "-h" ]
+    then
+        echo "Help argument detected."
+    fi
+done
+path=`pwd`
+udi=`echo $1 | grep -oP "([A-Za-z0-9]{2}.x[0-9]{3}.[0-9]{3})[.:]([0-9]{4})"`
+
+echo "Processing UDI: $udi"
+
+readmefile="$udi-ReadMe.txt"
+manifestfile="$udi-file-manifest.txt"
+zipfile="$udi-manifest.zip"
+
+echo "Generating file: $path/$udi-ReadMe.txt"
+python share/python/create-tree.py -d $1 > $readmefile
+unix2dos $readmefile
+
+echo "Generating file: $path/$udi-file-manifest.txt"
+python share/python/create-tree.py $1 > $manifestfile
+unix2dos $manifestfile
+
+zip $zipfile $1
+zip $zipfile -m $manifestfile $readmefile
+unzip -l $zipfile
diff --git a/share/python/create-tree.py b/share/python/create-tree.py
@@ -0,0 +1,174 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import csv
+import argparse
+import operator
+import os
+import re
+import sys
+import textwrap
+from directory_tree_node import DirectoryTreeNode
+from collections import OrderedDict
+
+def check_header(filename):
+    # This function checks for a valid hashdeep header
+    # in the passed file. If the header is valid, the
+    # invocation path is returned, otherwise false
+    # is returned in the event of an invalid header.
+
+    # example header:
+
+    # %%%% HASHDEEP-1.0
+    # %%%% size,md5,sha256,filename
+    # ## Invoked from: /mnt/LTFS/R1.x137.108.0001
+    # ## $ hashdeep -r .
+    # ##
+
+    with open(filename) as f:
+        first = f.readline().rstrip() == '%%%% HASHDEEP-1.0'
+        second = f.readline().rstrip() == '%%%% size,md5,sha256,filename'
+        third_line = f.readline().rstrip()
+        third = re.match('^## Invoked from: ', third_line) != None
+        fourth_line = f.readline().rstrip()
+        # This hardcoded offset is safe because of the previous re.match() check.
+        path = third_line[17:]
+        forth = re.match('^## \$ hashdeep -r ', fourth_line) != None
+        fifth = f.readline().rstrip() == '##'
+    if (first and second and third and forth and fifth):
+        return path
+    else:
+        return None
+
+# https://www.oreilly.com/library/view/python-cookbook/0596001673/ch04s16.html
+def splitall(path):
+    allparts = []
+    while 1:
+        parts = os.path.split(path)
+        if parts[0] == path:  # sentinel for absolute paths
+            allparts.insert(0, parts[0])
+            break
+        elif parts[1] == path: # sentinel for relative paths
+            allparts.insert(0, parts[1])
+            break
+        else:
+            path = parts[0]
+            allparts.insert(0, parts[1])
+    return allparts
+
+
+def generate_tree(filename, short):
+    path = check_header(filename)
+    filetypes = {}
+    # Extract the UDI from the passed starting path.
+    udi_pattern = re.compile('([A-Za-z0-9]{2}.x[0-9]{3}.[0-9]{3})[.:]([0-9]{4})')
+    udi_parts = udi_pattern.findall(path)
+    udi = udi_parts[0][0] + ':' + udi_parts[0][1]
+
+    if (path is not None):
+        sizes = OrderedDict()
+        with open(filename, 'rb') as f:
+            reader = csv.reader(f)
+            rownum = 1
+            last = None
+            for row in reader:
+                # skip header
+                if (rownum > 5):
+                    object_filename = re.sub(path + '/', '', row[3])
+                    object_size = row[0]
+
+                    # Find file's filetype, add to count by filetype.
+                    filetype = os.path.splitext(row[3])[1]
+                    try:
+                        filetypes[filetype] += 1
+                    except KeyError:
+                        filetypes[filetype] = 1
+
+                    # Split out paths to keep track, by dir, of totals.
+                    parts = splitall(object_filename)
+                    for i in range (0, len(parts), 1):
+                        if (i == 0):
+                            my_str = parts[i]
+                        elif (i < len(parts)-1):
+                            my_str = my_str + '/' + parts[i]
+                        else:
+                            # Appending '|EOL:' to ends of non-file paths, so this indicates directories.
+                            # elegance-- but works.
+                            my_str = my_str + '/' + parts[i] + '|EOL:'
+                        try:
+                            sizes[my_str] += int(object_size)
+                        except KeyError:
+                            sizes[my_str] = int(object_size)
+                rownum += 1
+            # Output Section
+            if (short):
+                print "Dataset Directory Summary for " + udi
+            else:
+                print "Dataset File Manifest for " + udi
+            print textwrap.dedent("""\
+
+                This dataset is greater than 25 GB and therefore too large to be downloaded
+                through direct download. In order to obtain this dataset, please email
+                [email protected] to make arrangements. If you would like a subset of the
+                dataset files, please indicate which directories and/or files.
+
+                """)
+            # Display filetype summary in short mode.
+            if (short):
+                extensions = []
+                for file_type, type_count in filetypes.iteritems():
+                    if (file_type == ''):
+                        file_type = 'no extension'
+                    extensions.append(file_type)
+                print("File Extensions:")
+                extensions.sort()
+                print(','.join(extensions))
+                print
+
+                # Sort by count in each type, descending.
+                for file_type, type_count in sorted(filetypes.iteritems(), reverse=True, key=lambda (k,v): (v,k)):
+                    if(file_type == ''):
+                        file_type = '<no extension>'
+                    formatted_line = '%10s  %15s' % (str(type_count), file_type)
+                    print formatted_line
+                print
+                print("Total Files - " + str(rownum-5-1))
+            print
+            if (short):
+                print('Directories Structure:')
+            else:
+                print('File Listing:')
+            print
+
+            for path, size in sizes.iteritems():
+                if (short):
+                    # Display directories only in short mode.
+                    if(re.search("\|EOL:$", path)):
+                        pass
+                    else:
+                        opPath = re.sub('\|EOL:', '', path)
+                        DirectoryTreeNode.buildTree(directoryTreeNodeRoot, opPath, size)
+                else:
+                    opPath = re.sub('\|EOL:', '', path)
+                    DirectoryTreeNode.buildTree(directoryTreeNodeRoot, opPath, size)
+            # print the tree starting with the node(s) that
+            # are children of the root. The root does not contain data.
+            rootChildren = directoryTreeNodeRoot.getChildren()
+            for child in rootChildren:
+                child.printTree(0)
+    else:
+        print("Error in header. Stopping")
+
+
+# Module-level root of the tree: generate_tree() adds every entry beneath it
+# and then prints its children (the root itself is not printed).
+directoryTreeNodeRoot = DirectoryTreeNode('root',0)
+
+def main(argv, script_name):
+    parser = argparse.ArgumentParser()
+    # Stores args.d boolean, true if -d is set, false otherwise.
+    parser.add_argument('-d', action='store_true', help='Print only directories.')
+    parser.add_argument('hashfile')
+    args = parser.parse_args()
+    generate_tree(args.hashfile, args.d)
+
+# Run only when executed as a script: hand main() the arguments (without the
+# program name) and the program name.
+if __name__ == "__main__":
+    main(sys.argv[1:], sys.argv[0])
+