tests for spark #128

Open · wants to merge 2 commits into master
Empty file added coconnect/io/__init__.py
70 changes: 70 additions & 0 deletions coconnect/io/plugins/local.py
@@ -0,0 +1,70 @@
import pandas as pd
from coconnect.tools.logger import Logger

class InputData:
    def __init__(self,chunksize):
        self.chunksize = chunksize

        self.__file_readers = {}
        self.__dataframe = {}

        self.logger = Logger(self.__class__.__name__)
        self.logger.info("InputData Object Created")
        if self.chunksize is not None:
            self.logger.info(f"Using a chunksize of '{self.chunksize}' nrows")

    def all(self):
        return {
            key:self[key]
            for key in self.keys()
        }

    def keys(self):
        return self.__file_readers.keys()

    def next(self):
        #loop over all loaded files
        for key in self.keys():
            #get the next dataframe chunk for this file
            self.__dataframe[key] = self.get_df_chunk(key)

        #check if all __dataframe objects are empty
        #if they are, raise a StopIteration as processing has finished
        if all([x.empty for x in self.__dataframe.values()]):
            self.logger.debug("All input files have now been processed.")
            raise StopIteration

        self.logger.info(f"Moving onto the next chunk of data (of size {self.chunksize})")


    def get_df_chunk(self,key):
        #obtain the file by key
        obj = self.__file_readers[key]
        #if it is a TextFileReader, get a dataframe chunk
        if isinstance(obj,pd.io.parsers.TextFileReader):
            try:
                #for this file reader, get the next chunk of data and update self.__dataframe
                return obj.get_chunk(self.chunksize)
            except StopIteration:
                #if at the end of the file reader, return an empty frame
                return pd.DataFrame(columns=self.__dataframe[key].columns)
        else:
            #if we're handling non-chunked data,
            #return an empty dataframe if we've already loaded this dataframe
            if key in self.__dataframe.keys():
                return pd.DataFrame()
            #otherwise return the dataframe as it's the first time we're getting it
            return obj


    def __getitem__(self,key):
        if key not in self.__dataframe.keys():
            self.__dataframe[key] = self.get_df_chunk(key)
        return self.__dataframe[key]

    def __setitem__(self,key,obj):
        if not (isinstance(obj,pd.DataFrame) or isinstance(obj,pd.io.parsers.TextFileReader)):
            raise NotImplementedError("When using InputData, the object must be of type "
                                      f"{pd.DataFrame} or {pd.io.parsers.TextFileReader} ")
        self.logger.info(f"Registering {key} [{type(obj)}]")
        self.__file_readers[key] = obj
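
For context, the relocated InputData plugin is driven by registering either a plain DataFrame or a pandas TextFileReader under a key, then pulling chunks until next() signals the end. A minimal sketch, assuming a CSV input; the file name person.csv and the chunk size are illustrative, not part of the package:

import pandas as pd
from coconnect.io.plugins.local import InputData

data = InputData(chunksize=1000)
#register a chunked reader for a hypothetical person.csv
data["person.csv"] = pd.read_csv("person.csv", chunksize=1000, dtype=str)

while True:
    df = data["person.csv"]   #current chunk for this file
    print(f"processing {len(df)} rows")
    try:
        data.next()           #advance every registered file to its next chunk
    except StopIteration:
        break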
93 changes: 93 additions & 0 deletions coconnect/io/plugins/spark.py
@@ -0,0 +1,93 @@
import pandas as pd
from coconnect.tools.logger import Logger

from pyspark.sql import SparkSession

class SparkData:
    def __init__(self,_map):

        self.logger = Logger(self.__class__.__name__)
        self.logger.info("SparkData Object Created")

        #get (or create) a spark session
        self.spark = SparkSession \
            .builder \
            .appName("Python Spark SQL basic example") \
            .getOrCreate()

        self.__file_readers = {}
        self.__dataframe = {}

        #spark reads whole files, so no chunking is used
        self.chunksize = None

        #load each input csv with spark and register it;
        #converting to pandas here lets the retrieval logic below
        #(shared with InputData) be reused unchanged
        for name,obj in _map.items():
            fname = obj['file']
            df = self.spark.read.option("header",True) \
                                .csv(fname)
            self[name] = df.toPandas()

    def all(self):
        return {
            key:self[key]
            for key in self.keys()
        }

    def keys(self):
        return self.__file_readers.keys()

    def next(self):
        #loop over all loaded files
        for key in self.keys():
            #get the next dataframe chunk for this file
            self.__dataframe[key] = self.get_df_chunk(key)

        #check if all __dataframe objects are empty
        #if they are, raise a StopIteration as processing has finished
        if all([x.empty for x in self.__dataframe.values()]):
            self.logger.debug("All input files have now been processed.")
            raise StopIteration

        self.logger.info(f"Moving onto the next chunk of data (of size {self.chunksize})")


    def get_df_chunk(self,key):
        #obtain the file by key
        obj = self.__file_readers[key]
        #if it is a TextFileReader, get a dataframe chunk
        if isinstance(obj,pd.io.parsers.TextFileReader):
            try:
                #for this file reader, get the next chunk of data and update self.__dataframe
                return obj.get_chunk(self.chunksize)
            except StopIteration:
                #if at the end of the file reader, return an empty frame
                return pd.DataFrame(columns=self.__dataframe[key].columns)
        else:
            #if we're handling non-chunked data,
            #return an empty dataframe if we've already loaded this dataframe
            if key in self.__dataframe.keys():
                return pd.DataFrame()
            #otherwise return the dataframe as it's the first time we're getting it
            return obj


    def __getitem__(self,key):
        if key not in self.__dataframe.keys():
            self.__dataframe[key] = self.get_df_chunk(key)
        return self.__dataframe[key]

    def __setitem__(self,key,obj):
        if not (isinstance(obj,pd.DataFrame) or isinstance(obj,pd.io.parsers.TextFileReader)):
            raise NotImplementedError("When using SparkData, the object must be of type "
                                      f"{pd.DataFrame} or {pd.io.parsers.TextFileReader} ")
        self.logger.info(f"Registering {key} [{type(obj)}]")
        self.__file_readers[key] = obj
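
The class above reuses the pandas retrieval logic by converting each Spark dataframe to pandas up front, which pulls whole files onto the driver. If chunked delivery is wanted without that conversion, one possible direction is a small generator over the Spark dataframe. This is a sketch only; the helper name spark_chunks is not part of the package:

import pandas as pd

def spark_chunks(spark_df, chunk_size):
    #yield pandas DataFrames of at most chunk_size rows from a pyspark.sql.DataFrame
    columns = spark_df.columns
    buffer = []
    #toLocalIterator() streams rows back to the driver without collecting everything at once
    for row in spark_df.toLocalIterator():
        buffer.append(row.asDict())
        if len(buffer) == chunk_size:
            yield pd.DataFrame(buffer, columns=columns)
            buffer = []
    if buffer:
        yield pd.DataFrame(buffer, columns=columns)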
79 changes: 8 additions & 71 deletions coconnect/tools/file_helpers.py
@@ -4,6 +4,9 @@
import json
import pandas as pd
from coconnect.tools.logger import Logger
from coconnect.io.plugins.local import InputData
from coconnect.io.plugins.spark import SparkData


class MissingInputFiles(Exception):
    pass
@@ -13,75 +16,7 @@ class DifferingRows(Exception):
    pass


class InputData:
    def __init__(self,chunksize):
        self.chunksize = chunksize

        self.__file_readers = {}
        self.__dataframe = {}

        self.logger = Logger(self.__class__.__name__)
        self.logger.info("InputData Object Created")
        if self.chunksize is not None:
            self.logger.info(f"Using a chunksize of '{self.chunksize}' nrows")

    def all(self):
        return {
            key:self[key]
            for key in self.keys()
        }

    def keys(self):
        return self.__file_readers.keys()

    def next(self):
        #loop over all loaded files
        for key in self.keys():
            #get the next dataframe chunk for this file
            self.__dataframe[key] = self.get_df_chunk(key)

        #check if all __dataframe objects are empty
        #if they are, raise a StopIteration as processing has finished
        if all([x.empty for x in self.__dataframe.values()]):
            self.logger.debug("All input files have now been processed.")
            raise StopIteration

        self.logger.info(f"Moving onto the next chunk of data (of size {self.chunksize})")


    def get_df_chunk(self,key):
        #obtain the file by key
        obj = self.__file_readers[key]
        #if it is a TextFileReader, get a dataframe chunk
        if isinstance(obj,pd.io.parsers.TextFileReader):
            try:
                #for this file reader, get the next chunk of data and update self.__dataframe
                return obj.get_chunk(self.chunksize)
            except StopIteration:
                #if at the end of the file reader, return an empty frame
                return pd.DataFrame(columns=self.__dataframe[key].columns)
        else:
            #if we're handling non-chunked data,
            #return an empty dataframe if we've already loaded this dataframe
            if key in self.__dataframe.keys():
                return pd.DataFrame()
            #otherwise return the dataframe as it's the first time we're getting it
            return obj


    def __getitem__(self,key):
        if key not in self.__dataframe.keys():
            self.__dataframe[key] = self.get_df_chunk(key)
        return self.__dataframe[key]

    def __setitem__(self,key,obj):
        if not (isinstance(obj,pd.DataFrame) or isinstance(obj,pd.io.parsers.TextFileReader)):
            raise NotImplementedError("When using InputData, the object must be of type "
                                      f"{pd.DataFrame} or {pd.io.parsers.TextFileReader} ")
        self.logger.info(f"Registering {key} [{type(obj)}]")
        self.__file_readers[key] = obj



def load_json_delta(f_in,original):
    logger = Logger("load_json_delta")
    logger.info(f"loading a json from '{f_in}' as a delta")
@@ -173,8 +108,10 @@ def load_csv(_map,chunksize=None,nrows=None,lower_col_names=False,load_path="",r
        if k in source_map
    }

    retval = InputData(chunksize)

    retval = SparkData(_map)

    for key,obj in _map.items():
        fields = None
        if isinstance(obj,str):
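
As committed, load_csv now always builds a SparkData object, which bypasses the existing pandas path. A guarded selection would keep both backends available while the Spark one is under test; the use_spark flag below is hypothetical and not part of the current signature:

    #hypothetical switch inside load_csv: use_spark is not an existing parameter,
    #it is only shown to illustrate keeping the pandas backend alive
    if use_spark:
        retval = SparkData(_map)
    else:
        retval = InputData(chunksize)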
1 change: 1 addition & 0 deletions requirements.txt
@@ -16,3 +16,4 @@ sqlalchemy-utils
pyyaml
python-daemon
inquirer
pyspark