Skip to content

Commit

Permalink
Merge pull request #456 from Christovis/collectmail_for_listserv
Browse files Browse the repository at this point in the history
Collectmail for listserv
  • Loading branch information
sbenthall authored May 6, 2021
2 parents 2785d2e + aed908f commit 304dd26
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 25 deletions.
32 changes: 22 additions & 10 deletions bigbang/listserv.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ def to_mbox(self, dir_out: str, filename: Optional[str] = None):
filepath = f"{dir_out}/{self.name}.mbox"
else:
filepath = f"{dir_out}/{filename}.mbox"
logger.info(f"The list {self.name} is save at {filepath}.")
logger.info(f"The list {self.name} is saved at {filepath}.")
first = True
for msg in self.messages:
if first:
Expand Down Expand Up @@ -961,6 +961,7 @@ def from_mailing_lists(
login: Optional[Dict[str, str]] = {"username": None, "password": None},
session: Optional[str] = None,
only_mlist_urls: bool = True,
instant_save: Optional[bool] = True,
) -> "ListservArchive":
"""
Create ListservArchive from a given list of 'ListservList'.
Expand All @@ -975,15 +976,23 @@ def from_mailing_lists(
if session is None:
session = get_auth_session(url_login, **login)
lists = []
for idx, url in enumerate(url_mailing_lists):
lists.append(
ListservList.from_url(
name=idx,
url=url,
select=select,
session=session,
)
for url in url_mailing_lists:
mlist_name = url.split('A0=')[-1]
mlist = ListservList.from_url(
name=mlist_name,
url=url,
select=select,
session=session,
)
if len(mlist) != 0:
if instant_save:
dir_out = CONFIG.mail_path + name
if os.path.isdir(dir_out) is False:
os.mkdir(dir_out)
mlist.to_mbox(dir_out=dir_out)
else:
logger.info(f"Recorded the list {mlist.name}.")
lists.append(mlist)
else:
lists = url_mailing_lists
return cls(name, url_root, lists)
Expand All @@ -996,7 +1005,7 @@ def from_listserv_directory(
folderdsc: str = "*",
filedsc: str = "*.LOG?????",
select: Optional[dict] = None,
) -> "ListservList":
) -> "ListservArchive":
"""
Args:
name: Name of the archive, e.g. '3GPP'.
Expand Down Expand Up @@ -1075,6 +1084,9 @@ def get_lists_from_url(
)
if len(mlist) != 0:
if instant_save:
dir_out = CONFIG.mail_path + name
if os.path.isdir(dir_out) is False:
os.mkdir(dir_out)
mlist.to_mbox(dir_out=CONFIG.mail_path)
archive.append(mlist.name)
else:
Expand Down
55 changes: 41 additions & 14 deletions bigbang/mailman.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@
import urllib.error
import urllib.parse
import urllib.request
from urllib.parse import urlparse
import warnings
from pprint import pprint as pp
from typing import Union

import pandas as pd
import yaml
Expand All @@ -27,7 +29,8 @@
gz_exp = re.compile(r'href="(\d\d\d\d-\w*\.txt\.gz)"')
ietf_ml_exp = re.compile(r'href="([\d-]+.mail)"')
w3c_archives_exp = re.compile(r"lists\.w3\.org")
listserv_archives_exp = re.compile(r"list\.etsi\.org")
tgpp_archives_exp = re.compile(r'list\.etsi\.org')
ieee_archives_exp = re.compile(r'listserv\.ieee\.org')

mailing_list_path_expressions = [gz_exp, ietf_ml_exp, txt_exp]

Expand Down Expand Up @@ -93,11 +96,12 @@ def load_data(


def collect_from_url(
url: str, archive_dir: str = CONFIG.mail_path, notes=None
url: Union[list, str], archive_dir: str = CONFIG.mail_path, notes=None
):
"""Collect data from a given url."""

url = url.rstrip()
if isinstance(url, str):
url = url.rstrip()
try:
has_archives = collect_archive_from_url(
url, archive_dir=archive_dir, notes=notes
Expand Down Expand Up @@ -160,8 +164,13 @@ def collect_from_file(
):
"""Collect urls from a file."""
urls = urls_to_collect(urls_file)
for url in urls:
collect_from_url(url, archive_dir=archive_dir, notes=notes)
if tgpp_archives_exp.search(urls[0]):
collect_from_url(urls, archive_dir=archive_dir, notes=notes)
elif ieee_archives_exp.search(urls[0]):
collect_from_url(urls, archive_dir=archive_dir, notes=notes)
else:
for url in urls:
collect_from_url(url, archive_dir=archive_dir, notes=notes)


def get_list_name(url):
Expand Down Expand Up @@ -259,7 +268,7 @@ def access_provenance(directory):
file_path = os.path.join(directory, PROVENANCE_FILENAME)
if os.path.isfile(file_path): # a provenance file already exists
file_handle = open(file_path, "r")
provenance = yaml.load(file_handle)
provenance = yaml.safe_load(file_handle)
return provenance
return None

Expand All @@ -273,24 +282,42 @@ def update_provenance(directory, provenance):
file_handle.close()


def collect_archive_from_url(url, archive_dir=CONFIG.mail_path, notes=None):
def collect_archive_from_url(
url: Union[list, str], archive_dir=CONFIG.mail_path, notes=None,
):
"""
Collect archive files (generally tar.gz) from a mailman archive page.
Return True if archives were downloaded, False otherwise
(for example if the page lists no accessible archive files).
"""
list_name = get_list_name(url)
logging.info("Getting archive page for %s", list_name)
if isinstance(url, str):
list_name = get_list_name(url)
logging.info("Getting archive page for %s", list_name)
elif isinstance(url, list):
urls = url
url = url[0]
url_root = "https://" + urlparse(url).hostname

if w3c_archives_exp.search(url):
return w3crawl.collect_from_url(url, archive_dir, notes=notes)
elif listserv_archives_exp.search(url):
listserv.ListservArchive.from_url(
elif tgpp_archives_exp.search(url):
return listserv.ListservArchive.from_mailing_lists(
name="3GPP",
url_root=url,
url_home=url + "HOME",
instant_dump=True,
url_root=url_root,
url_mailing_lists=urls,
login={'username': '...', 'password': '...'},
only_mlist_urls=False,
instant_save=True,
)
elif ieee_archives_exp.search(url):
return listserv.ListservArchive.from_mailing_lists(
name="IEEE",
url_root=url_root,
url_mailing_lists=urls,
login={'username': '...', 'password': '...'},
only_mlist_urls=False,
instant_save=True,
)

response = urllib.request.urlopen(url)
Expand Down
2 changes: 1 addition & 1 deletion config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
base_loc = os.path.abspath(os.path.join(file_path, os.pardir)) # parent directory of config directory
config_filepath = os.path.join(base_loc, "config", "config.yml")
stream = open(config_filepath, "r")
dictionary = yaml.load(stream)
dictionary = yaml.safe_load(stream)

class Config(object):
def __init__(self, conf):
Expand Down

0 comments on commit 304dd26

Please sign in to comment.