-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransaction_data.py
58 lines (46 loc) · 2.12 KB
/
transaction_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
import numpy as np
from pydbgen import pydbgen
import uuid
import csv
import os
import glob
import datetime
class DataGenerator:
    """Generate synthetic booking columns for a fixed number of rows.

    ``datasetSize`` is the number of rows produced by ``genDataset``.
    """

    def __init__(self, datasetSize = 1):
        # pydbgen handle; none of the methods in this class use it.
        self.pyDb = pydbgen.pydb()
        self.datasetSize = datasetSize

    def getBookingAmount(self, size):
        """Return ``size`` random integers drawn from [100, 4000)."""
        amounts = np.random.randint(100, 4000, size=size)
        return amounts

    def getBookingID(self, size):
        """Return a list of ``size`` random UUID4 values rendered as strings."""
        booking_ids = []
        for _ in range(size):
            booking_ids.append(str(uuid.uuid4()))
        return booking_ids

    def genDataset(self):
        """Return a DataFrame of ``datasetSize`` rows.

        The frame has two columns: 'BookingAmount' and 'BookingID'.
        """
        row_count = self.datasetSize
        amounts = self.getBookingAmount(row_count)
        booking_ids = self.getBookingID(row_count)
        return pd.DataFrame({
            'BookingAmount': amounts,
            'BookingID': booking_ids,
        })
# Build a transaction file from the bounce logs: keep the rows whose
# BouncedAt is 0, attach generated booking fields and a randomly chosen
# hotel name, and write the result to a timestamped CSV.
mycsvdir = 'bounced_data/'
# glob returns matches in arbitrary order; sorting keeps the row order of the
# output independent of the filesystem.
csvfiles = sorted(glob.glob(os.path.join(mycsvdir, '*.csv')))
if not csvfiles:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list; report the directory that was searched instead.
    raise ValueError('No CSV files found in %r' % mycsvdir)
dataframes = [
    pd.read_csv(
        csvfile,
        usecols = ["UserName", "BouncedAt", "TimeStamp", "CheckIn", "CheckOut", "CityName", "CityId"],
    )
    for csvfile in csvfiles
]
BouncedDataDF = pd.concat(dataframes, ignore_index=True)
BouncedDataDF_filtered = BouncedDataDF[BouncedDataDF['BouncedAt'] == 0]
# Renumber rows 0..n-1 so the generated columns below (which carry a default
# 0..n-1 index) align row-for-row on assignment. drop=True discards the old
# index rather than materialising it as an 'index' column.
BouncedDataDF_filtered = BouncedDataDF_filtered.reset_index(drop=True)
DFsize = len(BouncedDataDF_filtered)
myDataGen = DataGenerator(DFsize)
myDataFrame = myDataGen.genDataset()
# Number of nights = CheckOut - CheckIn, both parsed as YYYY-MM-DD dates.
check_out = pd.to_datetime(BouncedDataDF_filtered['CheckOut'], format="%Y-%m-%d")
check_in = pd.to_datetime(BouncedDataDF_filtered['CheckIn'], format="%Y-%m-%d")
BouncedDataDF_filtered['RoomNights'] = (check_out - check_in).dt.days
BouncedDataDF_filtered['BookingAmount'] = myDataFrame['BookingAmount']
BouncedDataDF_filtered['BookingID'] = myDataFrame['BookingID']
df = BouncedDataDF_filtered.drop(columns=['BouncedAt'])
# Add a HotelName column, sampled with replacement from the mapping file.
# NOTE(review): hotels are drawn uniformly from the whole file, independent of
# the row's city — confirm that is intended.
hotelname = pd.read_csv('other_data/city_hotel_mapping.csv')['HotelName'].to_numpy()
df.insert(loc = 4, column = 'HotelName', value = np.random.choice(hotelname, size = DFsize, replace=True))
# Name the file by epoch seconds. strftime('%s') is a platform-specific
# extension (unsupported on some platforms), so compute the timestamp directly.
filename = 'transaction_data/transaction_data_%s.csv' % int(datetime.datetime.now().timestamp())
# Make sure the output directory exists so the write cannot fail after all
# the work above has been done.
os.makedirs(os.path.dirname(filename), exist_ok=True)
df.to_csv(filename, index = False)