Skip to content

Commit

Permalink
Clean up postprocess scripts
Browse files (browse the repository at this point in the history)
- Add unique name to crawlers for better distinction
- Remove unit test leftovers
- Remove wrong DNS hierarchy script
- Fix log file name of url2hostname script
  • Loading branch information
m-appel committed Dec 19, 2024
1 parent 4927e4e commit f246e32
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 143 deletions.
3 changes: 2 additions & 1 deletion create_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,11 @@ def __init__(self, message):
logging.info('Post-processing...')
for module_name in conf['iyp']['post']:
module = importlib.import_module(module_name)
name = module_name.replace('iyp.post.', '')

try:
logging.info(f'start {module}')
post = module.PostProcess()
post = module.PostProcess(name)
post.run()
post.close()
status[module_name] = STATUS_OK
Expand Down
4 changes: 2 additions & 2 deletions iyp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,11 +631,11 @@ def batch_add_properties(self, id_prop_list):


class BasePostProcess(object):
def __init__(self):
def __init__(self, name):
"""IYP and references initialization."""

self.reference = {
'reference_name': 'iyp',
'reference_name': f'iyp.{name}',
'reference_org': 'Internet Yellow Pages',
'reference_url_data': 'https://iyp.iijlab.net',
'reference_url_info': str(),
Expand Down
21 changes: 4 additions & 17 deletions iyp/post/address_family.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from iyp import BasePostProcess

NAME = 'post.address_family'


class PostProcess(BasePostProcess):
def run(self):
Expand All @@ -19,22 +21,7 @@ def run(self):
self.iyp.tx.run("MATCH (ip:IP) WHERE ip.ip CONTAINS ':' SET ip.af = 6;")

def unit_test(self):

self.run()
# test the prefix tree for IPv4 and IPv6 and return count
result_prefix = self.iyp.tx.run(
'MATCH (pfx:Prefix) WHERE pfx.af <> 4 and pfx.af <> 6 RETURN count(pfx);').data()

# test the IP tree for IPv4 and IPv6 and return count
result_ip = self.iyp.tx.run('MATCH (ip:IP) WHERE ip.af <> 4 and ip.af <> 6 RETURN count(ip);').data()

result = result_prefix[0]['count(pfx)'] + result_ip[0]['count(ip)']
logging.info(
'Count of the remaining prefex/IP which is not IPv4 or IPv6: %s and the assert result is %s' %
(result, result == 0))
self.close()
print('assertion error ') if result != 0 else print('assertion success')
assert result == 0
raise NotImplementedError()


def main() -> None:
Expand All @@ -53,7 +40,7 @@ def main() -> None:

logging.info(f'Started: {sys.argv}')

post = PostProcess()
post = PostProcess(NAME)
if args.unit_test:
post.unit_test()
else:
Expand Down
7 changes: 6 additions & 1 deletion iyp/post/clean_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from iyp import BasePostProcess

NAME = 'post.clean_links'


class PostProcess(BasePostProcess):
def get_links_of_type(self, link_type, prop_dict=None):
Expand Down Expand Up @@ -78,6 +80,9 @@ def run(self):
for link_type in link_types:
self.clean_links_of_type(link_type, {'reference_org': 'OONI'})

def unit_test(self):
raise NotImplementedError()


def main() -> None:
parser = argparse.ArgumentParser()
Expand All @@ -95,7 +100,7 @@ def main() -> None:

logging.info(f'Started: {sys.argv}')

post = PostProcess()
post = PostProcess(NAME)
if args.unit_test:
post.unit_test()
else:
Expand Down
10 changes: 4 additions & 6 deletions iyp/post/country_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from iyp import BasePostProcess

NAME = 'post.country_information'


class PostProcess(BasePostProcess):
def run(self):
Expand All @@ -32,11 +34,7 @@ def run(self):
self.iyp.commit()

def unit_test(self):
self.run()
count = self.iyp.tx.run('MATCH (n:Country) WHERE n.alpha3 IS NOT NULL RETURN COUNT(n)').single()
self.close()
print('assertion error ') if count == 0 else print('assertion success')
assert count > 0
raise NotImplementedError()


def main() -> None:
Expand All @@ -55,7 +53,7 @@ def main() -> None:

logging.info(f'Started: {sys.argv}')

post = PostProcess()
post = PostProcess(NAME)
if args.unit_test:
post.unit_test()
else:
Expand Down
95 changes: 0 additions & 95 deletions iyp/post/dns_hierarchy.py

This file was deleted.

17 changes: 4 additions & 13 deletions iyp/post/ip2prefix.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from iyp import BasePostProcess

NAME = 'post.ip2prefix'


class PostProcess(BasePostProcess):
@staticmethod
Expand Down Expand Up @@ -74,19 +76,8 @@ def run(self):
# push sub-prefix to covering-prefix links
self.iyp.batch_add_links('PART_OF', links)

def count_relation(self):
count = self.iyp.tx.run('MATCH (ip:IP)-[r]->() RETURN count(r) AS count').single()
return count

def unit_test(self):
result_before = self.count_relation()
logging.info('relations before: %s' % result_before)
self.run()
result_after = self.count_relation()
logging.info('relations after: %s' % result_after)
self.close()
print('assertion error ') if result_after <= result_before else print('assertion success')
assert result_after > result_before
raise NotImplementedError()


def main() -> None:
Expand All @@ -105,7 +96,7 @@ def main() -> None:

logging.info(f'Started: {sys.argv}')

post = PostProcess()
post = PostProcess(NAME)
if args.unit_test:
post.unit_test()
else:
Expand Down
34 changes: 26 additions & 8 deletions iyp/post/url2hostname.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import argparse
import logging
import os
import sys

import tldextract

from iyp import BasePostProcess

NAME = 'post.url2hostname'


class PostProcess(BasePostProcess):
def run(self):
Expand Down Expand Up @@ -35,21 +39,35 @@ def run(self):
# push links to IYP
self.iyp.batch_add_links('PART_OF', links)

def unit_test(self):
raise NotImplementedError()

if __name__ == '__main__':

scriptname = sys.argv[0].replace('/', '_')[0:-3]
FORMAT = '%(asctime)s %(processName)s %(message)s'
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument('--unit-test', action='store_true')
args = parser.parse_args()

scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3]
FORMAT = '%(asctime)s %(levelname)s %(message)s'
logging.basicConfig(
format=FORMAT,
filename='log/' + scriptname + '.log',
level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S'
)
logging.info('Start: %s' % sys.argv)

post = PostProcess()
post.run()
post.close()
logging.info(f'Started: {sys.argv}')

logging.info('End: %s' % sys.argv)
post = PostProcess(NAME)
if args.unit_test:
post.unit_test()
else:
post.run()
post.close()
logging.info(f'Finished: {sys.argv}')


if __name__ == '__main__':
main()
sys.exit(0)

0 comments on commit f246e32

Please sign in to comment.