-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatetime_handle.py
248 lines (189 loc) · 9.17 KB
/
datetime_handle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import pandas as pd
import numpy as np
import sys
import os
import re
from datetime import datetime
def get_fulltime(series, freq='D'):
    """
    Build a regular date range spanning the first and last entries of a series.

    Parameters:
    - series (sequence): Ordered, non-empty collection of dates (e.g. list,
      pd.DatetimeIndex or pd.Series). Only its first and last elements are used.
    - freq (str): Frequency string forwarded to `pd.date_range`. Default is 'D'.

    Returns:
    - pd.DatetimeIndex: Dates from the first to the last element, spaced by `freq`.

    Raises:
    - ValueError: If the endpoints cannot be read (e.g. empty input) or
      `pd.date_range` rejects them. The original exception is chained as the cause.
    """
    try:
        if isinstance(series, pd.Series):
            # Plain [] on a Series is label-based, so series[-1] fails on the
            # default integer index; .iloc is always positional.
            start_time, end_time = series.iloc[0], series.iloc[-1]
        else:
            start_time, end_time = series[0], series[-1]
        return pd.date_range(start_time, end_time, freq=freq)
    except Exception as e:
        # Keep the historical contract (any failure surfaces as ValueError with
        # the original message) while preserving the traceback of the cause.
        raise ValueError(str(e)) from e
# ------------------------------------------------------------------------------
def fulltime_table(df, fulltime_series):
    """
    Pad a DataFrame with empty rows for every date of a full series it lacks.

    Parameters:
    - df (pd.DataFrame): Input table; its first index label is used for the type check.
    - fulltime_series (pd.DatetimeIndex): The complete set of dates to cover.

    Returns:
    - pd.DataFrame: The rows of `df` plus one all-null row per date that is in
      `fulltime_series` but not in `df.index`, sorted by index.

    Raises:
    - ValueError: If the first index label of `df` is not an instance of the
      type of the first element of `fulltime_series`.
    """
    # Guard clause: only the first element of each side is inspected.
    expected_type = type(fulltime_series[0])
    if not isinstance(df.index[0], expected_type):
        raise ValueError("Data types of DataFrame index and input series do not match")

    # Dates present in the full series but absent from the table, in order.
    missing = sorted(set(fulltime_series) - set(df.index))

    # Placeholder rows: same columns as df, no data, indexed by the missing dates.
    filler = pd.DataFrame(data=None, columns=df.columns, index=missing)

    stacked = pd.concat([df, filler])
    return stacked.sort_index()
# ------------------------------------------------------------------------------
def convert_to_datetime(colname):
    """
    Convert a 'YYYYMMDD' string, optionally preceded by a letter prefix, to a timestamp.

    Parameters:
    - colname (str): Either exactly eight digits ('20041123'), or any text whose
      last ASCII letter is immediately followed by exactly eight digits
      ('N20041123', 'ND20041123').

    Returns:
    - pd.Timestamp: The date parsed from the eight digits with format '%Y%m%d'.

    Raises:
    - ValueError: If the string does not match either accepted layout, or the
      eight digits do not parse as a date with format '%Y%m%d'.
    """
    # Find the LAST ASCII letter: the lookahead only succeeds when no further
    # letter occurs before the end of the string. (Searching for the first
    # letter would wrongly reject multi-letter prefixes such as 'ND20041123'.)
    last_letter = re.search(r'[A-Za-z](?=[^A-Za-z]*\Z)', colname)

    if last_letter:
        # Everything after the last letter must be the date itself.
        numeric_part = colname[last_letter.end():]
        if len(numeric_part) == 8 and numeric_part.isdigit():
            return pd.to_datetime(numeric_part, format='%Y%m%d')
        raise ValueError("Input must end with 'YYYYMMDD' after letters.")

    # No letters at all: the whole string must be the eight-digit date.
    if len(colname) == 8 and colname.isdigit():
        return pd.to_datetime(colname, format='%Y%m%d')

    raise ValueError("Input must be 'YYYYMMDD' or contain letters followed by 'YYYYMMDD'.")
# ------------------------------------------------------------------------------
def datetime_to_string(date, initial_char='N'):
    """
    Format a date as initial_char + 'YYYYMMDD'.

    Parameters:
    - date (datetime | pd.Timestamp | pd.DatetimeIndex): The date(s) to format.
    - initial_char (str): Non-empty, letters-only prefix. Default is 'N'.

    Returns:
    - str: The prefixed date string for a single datetime / Timestamp.
    - pd.Index: One prefixed string per entry when `date` is a pd.DatetimeIndex.

    Raises:
    - ValueError: If initial_char is not a string of alphabetical characters.
    - TypeError: If the date is not a datetime, pd.Timestamp or pd.DatetimeIndex
      (pd.NaT is rejected as well).
    """
    # Validate input
    if not isinstance(initial_char, str) or not initial_char.isalpha():
        raise ValueError("Initial character(s) must only contain alphabetical characters.")
    # Plain datetime objects are accepted alongside the pandas types; pd.NaT is
    # excluded explicitly because it cannot be formatted with strftime.
    if date is pd.NaT or not isinstance(date, (datetime, pd.Timestamp, pd.DatetimeIndex)):
        raise TypeError("The date must be a datetime object.")

    if isinstance(date, pd.DatetimeIndex):
        # strftime on an index yields one string per entry; prefix each of them
        # instead of formatting the index repr into a single string.
        return pd.Index([f"{initial_char}{day}" for day in date.strftime('%Y%m%d')])

    # Convert datetime to string in 'YYYYMMDD' format and add the prefix
    date_str = date.strftime('%Y%m%d')
    return f"{initial_char}{date_str}"
# ------------------------------------------------------------------------------
def intersect_time_index(df1_index, df2_index):
    """
    Return the sorted elements shared by two time indices.

    Args:
        df1_index (iterable): Time indices of the first dataset (list, set, pandas Index, ...).
        df2_index (iterable): Time indices of the second dataset (list, set, pandas Index, ...).

    Returns:
        list: The elements present in both inputs, in ascending order.

    Raises:
        TypeError: If either input cannot be turned into a set.
        ValueError: If either input has no elements.
    """
    # Materialise both inputs as sets up front so that a bad second argument is
    # reported before any emptiness check runs.
    try:
        first = set(df1_index)
        second = set(df2_index)
    except TypeError as err:
        raise TypeError("Both inputs must be iterables (e.g., list, set, pandas Index).") from err

    if len(first) == 0:
        raise ValueError("The first input time index is empty.")
    if len(second) == 0:
        raise ValueError("The second input time index is empty.")

    # Shared elements, ordered ascending.
    shared = list(first & second)
    shared.sort()
    return shared
# ------------------------------------------------------------------------------
def extract_datetime_from_mfile(mfile):
    """
    Extract unique datetime components from filenames in a specified mfile.

    Each non-blank line is reduced to its basename without extension and split
    on '_'. From each of the last two fields the first three characters are
    dropped and 'N' is prepended; the results are collected without duplicates.

    Parameters:
    - mfile (str): Path to the mfile containing one filename per line.

    Returns:
    - list: Sorted list of unique 'N'-prefixed components taken from the filenames.

    Raises:
    - IndexError: If a non-blank line's basename has fewer than two
      '_'-separated fields.
    """
    # Read the file, stripping whitespace/newlines and ignoring blank lines
    # (a blank line has no fields and would otherwise raise IndexError).
    with open(mfile, "r") as input_file:
        lines = [stripped for stripped in (line.strip() for line in input_file) if stripped]

    datetimes = set()
    for line in lines:
        # Basename without directory or extension, split into '_' fields
        fields = os.path.basename(os.path.splitext(line)[0]).split("_")
        # The last two fields each carry one component after a 3-character prefix
        datetimes.add("N" + fields[-2][3:])
        datetimes.add("N" + fields[-1][3:])

    # Return a sorted list of unique datetime components
    return sorted(datetimes)
# ------------------------------------------------------------------------------
def numeric_time_index(time_series):
    """
    Return the integer positions of the non-null entries of a time series.

    Parameters:
        time_series (pandas.Series): Series whose values may contain nulls.

    Returns:
        numpy.ndarray: Positions (0 .. len-1) at which `time_series` is not null.
    """
    # Boolean mask: True wherever a value is present
    has_value = np.asarray(time_series.notna())

    # 0, 1, ..., len-1: one position per entry of the series
    positions = np.arange(len(time_series))

    # Keep only the positions of the non-null entries
    return positions[has_value]
# ------------------------------------------------------------------------------
def bytes_to_datetime(byte_string, date_format='%Y%m%d'):
    """
    Parse a date stored as a byte string into a datetime object.

    Parameters:
    - byte_string (bytes): UTF-8 encoded date text, e.g. b'20041123'.
    - date_format (str): strptime format describing the text. Default is
      '%Y%m%d' ('YYYYMMDD').

    Returns:
    - datetime: The date parsed from the decoded text.

    Raises:
    - TypeError: If the input is not a `bytes` object.
    - ValueError: If the bytes are not valid UTF-8, or the decoded text does
      not match `date_format`.

    Example:
    >>> bytes_to_datetime(b'20041123')
    datetime.datetime(2004, 11, 23, 0, 0)
    >>> bytes_to_datetime(b'23-11-2004', '%d-%m-%Y')
    datetime.datetime(2004, 11, 23, 0, 0)
    """
    # Only genuine bytes objects are accepted
    if not isinstance(byte_string, bytes):
        received = type(byte_string).__name__
        raise TypeError("Input must be of type 'bytes', but got type '{}'.".format(received))

    # Step 1: bytes -> text
    try:
        text = byte_string.decode('utf-8')
    except UnicodeDecodeError as decode_err:
        raise ValueError("Failed to decode byte string. Ensure the byte string is properly encoded in UTF-8.") from decode_err

    # Step 2: text -> datetime
    try:
        return datetime.strptime(text, date_format)
    except ValueError as parse_err:
        raise ValueError("The date string '{}' does not match the expected format '{}'."
                         " Please ensure the format corresponds to the date structure.".format(text, date_format)) from parse_err
# ------------------------------------------------------------------------------