-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathinit_spark.py
30 lines (24 loc) · 964 Bytes
/
init_spark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# initialize Spark Session
def init_spark():
    """Create (or reuse) a local SparkSession configured for testing.

    Locates Spark via ``findspark`` (which relies on ``SPARK_HOME``), builds a
    ``SparkConf`` for a ``local[*]`` master with a private warehouse
    directory, prints the effective configuration, and returns the session
    obtained from ``SparkSession.builder.getOrCreate()``.

    Returns:
        The ``SparkSession`` returned by ``getOrCreate()``.
    """
    import atexit
    import shutil
    import tempfile

    print("Initializing Spark...")
    import findspark
    findspark.init()  # uses SPARK_HOME
    print("Spark found in : ", findspark.find())

    # pyspark is imported only after findspark.init(), matching the original
    # ordering (findspark is what makes pyspark locatable).
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    # Use a unique temp dir for the warehouse dir, so we can run multiple
    # Spark sessions from one working directory.
    # mkdtemp() is used instead of TemporaryDirectory(): a TemporaryDirectory
    # held only by a local is finalized (and its directory deleted) as soon
    # as this function returns, while the session is still configured to use
    # it. Cleanup is deferred to interpreter exit instead.
    warehouse_dir = tempfile.mkdtemp(prefix="spark-warehouse-")
    atexit.register(shutil.rmtree, warehouse_dir, ignore_errors=True)

    config = (
        SparkConf()
        .setAppName("TestApp")
        .setMaster("local[*]")
        # Spark properties are namespaced under "spark."; the bare key
        # 'executor.memory' is not the executor-memory setting.
        .set('spark.executor.memory', '2g')
        .set('spark.sql.warehouse.dir', warehouse_dir)
    )
    print("Spark config:\n\t", config.toDebugString().replace("\n", "\n\t"))

    spark = SparkSession.builder.config(conf=config).getOrCreate()

    # uiWebUrl may be unavailable (e.g. when the UI is disabled); guard so
    # that a missing URL does not crash session setup. rsplit on the last
    # ':' extracts the port even if the host part itself contains colons.
    ui_url = spark.sparkContext.uiWebUrl
    if ui_url:
        print('Spark UI running on port ' + ui_url.rsplit(':', 1)[-1])
    else:
        print('Spark UI is not available')
    return spark
## end def -- init_spark