Skip to content

Commit

Permalink
Fix missing header case
Browse files Browse the repository at this point in the history
  • Loading branch information
steffsas committed Sep 16, 2024
1 parent efd6a7a commit d086303
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ output/
.venv/
.gitignore
LICENSE
README.md
README.md
.vscode/
41 changes: 40 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from datetime import datetime
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from ipaddress import ip_address, IPv6Address

URL_ENV = "URL"
LOG_FOLDER_ENV = "LOG_FOLDER"
Expand Down Expand Up @@ -146,9 +147,12 @@ def download():
ips = pd.read_csv(outputFile)
outputTxt = f"{outputFolder}/ipv6-udp-{date}.txt"

# let's prepare the columns, add column names if necessary
ips = prepareColumns(ips)

logger.info("got {0} ips, removing duplicates...".format(len(ips)))

ips = ips[ips["success"] == 1][
ips = ips[(ips["success"] == 1) | (ips["success"] == "1")][
# for some reason saddr is the original destination
["saddr"]
].drop_duplicates(ignore_index=True, inplace=False)
Expand Down Expand Up @@ -221,5 +225,40 @@ def extractDateFromFilename(filename: str) -> str:
logger.error("no date found in filename")
return ""

def prepareColumns(df: pd.DataFrame) -> pd.DataFrame:
    """Make sure *df* exposes ``saddr`` and ``success`` columns.

    If both names are already present, *df* is returned untouched.
    Otherwise the current column labels are assumed not to be a real
    header:

    * When the first label parses as an IPv6 address and the last label
      is ``1``, the labels are treated as a data row that was mistaken
      for the header, and that row is appended to the frame.
    * The columns are then renamed: to the 13 default names when the
      frame has exactly 13 columns, otherwise to ``col<i>`` with the
      first column called ``saddr`` and the last one ``success``.

    The frame is modified in place and also returned. Appending the
    recovered row stores its cells as strings, so ``success`` may hold
    ``"1"`` next to numeric values afterwards.

    Args:
        df: Parsed result table, with or without a proper header row.

    Returns:
        The same frame, guaranteed to have ``saddr`` and ``success``
        columns unless it has no columns at all.
    """
    DEFAULT_COLUMNS = [
        'saddr', 'daddr', 'ipid', 'ttl', 'sport', 'dport', 'classification',
        'repeat', 'cooldown', 'timestamp_ts', 'timestamp_us', 'data',
        'success',
    ]

    labels = list(df.columns)

    # Nothing to rename on a frame without columns; indexing the first
    # and last label below would raise IndexError.
    if not labels:
        return df

    if "success" in labels and "saddr" in labels:
        return df

    ipv6_dest = labels[0]
    success = labels[-1]

    # pandas renames a repeated header cell "X" to "X.1", "X.2", ... when
    # it parses a CSV. A success cell of "1" that follows another "1" in
    # the same row therefore shows up here as "1.<n>".
    mangledSuccess = (
        isinstance(success, str)
        and success.startswith("1.")
        and success[2:].isdecimal()
        and "1" in labels[:-1]
    )

    # check whether the "header" is really a successful IPv6 data row
    if isIpv6(ipv6_dest) and (success == "1" or success == 1 or mangledSuccess):
        logger.info("indicated header as valid hit, let's add it to the dataframe...")
        row = list(labels)
        if mangledSuccess:
            # restore the original cell so the row passes a success filter
            row[-1] = "1"
        # NOTE(review): other duplicated cells keep their pandas suffix
        # (e.g. "0.1"); only saddr and success are restored faithfully.
        df.loc[len(df)] = row

    if len(labels) == len(DEFAULT_COLUMNS):
        df.columns = list(DEFAULT_COLUMNS)
    else:
        # we define the last column as the success column
        # and the first column as the ip address
        cols = [f"col{i}" for i in range(len(labels))]
        cols[0] = "saddr"
        cols[-1] = "success"
        df.columns = cols

    return df

def isIpv6(addr: str) -> bool:
    """Tell whether *addr* parses as an IPv6 address.

    Returns ``False`` for IPv4 addresses and for any value that
    ``ip_address`` rejects with ``ValueError``.
    """
    try:
        parsed = ip_address(addr)
    except ValueError:
        # not an IP address at all
        return False
    return isinstance(parsed, IPv6Address)

# Run the entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ six==1.16.0
sniffio==1.3.1
soupsieve==2.5
typing_extensions==4.12.2
tzdata==2024.1
tzdata==2024.1

0 comments on commit d086303

Please sign in to comment.