diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..65b8b2a --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +parallel = True diff --git a/CHANGELOG.md b/CHANGELOG.md index d40c403..9678a53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## [0.2.0] - 2020-05-05 + +* add cp2k_bs2csv with support for CP2K v8+ and an API +* add xyz_restart_cleaner + ## [0.1.4] - 2020-04-03 * updated project urls diff --git a/README.md b/README.md index 46978ec..4ecd333 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,12 @@ # cp2k-output-tools -[![Build Status](https://github.com/cp2k/cp2k-output-tools/workflows/tests/badge.svg)](https://github.com/dev-zero/cp2k-output-tools/actions) [![codecov](https://codecov.io/gh/dev-zero/cp2k-output-tools/branch/develop/graph/badge.svg)](https://codecov.io/gh/dev-zero/cp2k-output-tools) [![PyPI](https://img.shields.io/pypi/pyversions/cp2k-output-tools)](https://pypi.org/project/cp2k-output-tools/) +[![Build Status](https://github.com/cp2k/cp2k-output-tools/workflows/tests/badge.svg)](https://github.com/cp2k/cp2k-output-tools/actions) [![codecov](https://codecov.io/gh/cp2k/cp2k-output-tools/branch/develop/graph/badge.svg)](https://codecov.io/gh/cp2k/cp2k-output-tools) [![PyPI](https://img.shields.io/pypi/pyversions/cp2k-output-tools)](https://pypi.org/project/cp2k-output-tools/) -Modular CP2K output file parsers, mostly in the form of regular expressions. +Modular CP2K output file parsers, mostly in the form of regular expressions, plus other tools to mangle various CP2K output files: + + * `cp2kparse` ... parse CP2K output files (for restart & input files look at the [cp2k-input-tools](https://github.com/cp2k/cp2k-input-tools) project) + * `xyz_restart_cleaner` ... when restarts occur during an MD run you may end up with duplicated frames in the trajectory; this tool filters them out + * `cp2k_bs2csv` ... convert a CP2K band structure file to multiple (one-per-set) CSV files for easier plotting. There is also an API available if you need to import band structure data into your application. ## Requirements @@ -12,7 +16,7 @@ Modular CP2K output file parsers, mostly in the form of regular expressions.
For development: https://poetry.eustace.io/ https://pytest.org/ -## Usage +## Usage: cp2kparse There is a simple command-line interface `cp2kparse`: @@ -493,6 +497,14 @@ with open("calc.out", "r") as fhandle: print(match.values) ``` +## Usage: xyz_restart_cleaner + +```console +$ xyz_restart_cleaner orig_trajectory.xyz new_trajectory.xyz +found restart point @1, dropping 1 frames, flushing 1 +flushing remaining 2 frames +``` + ## Development ```console diff --git a/cp2k_output_tools/__init__.py b/cp2k_output_tools/__init__.py index 8973bd0..658d01d 100644 --- a/cp2k_output_tools/__init__.py +++ b/cp2k_output_tools/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.4" +__version__ = "0.2.0" __all__ = ["builtin_matchers", "parse_iter"] diff --git a/cp2k_output_tools/bandstructure.py b/cp2k_output_tools/bandstructure.py new file mode 100755 index 0000000..e593df5 --- /dev/null +++ b/cp2k_output_tools/bandstructure.py @@ -0,0 +1,192 @@ +""" +Convert the CP2K band structure output to CSV files +""" + +__all__ = ["SpecialPoint", "Point", "set_gen"] + +import re +import argparse +from dataclasses import dataclass +from typing import List, Optional +import itertools + + +@dataclass +class SpecialPoint: + number: int + name: str + a: float + b: float + c: float + + +@dataclass +class Point: + a: float + b: float + c: float + bands: List[float] + spin: int + weight: Optional[float] = None + + +SET_MATCH = re.compile( + r""" +[ ]* + SET: [ ]* (?P<setnr>\d+) [ ]* + TOTAL [ ] POINTS: [ ]* (?P<totalpoints>\d+) [ ]* + \n +(?P<content> + [\s\S]*?(?=\n.*?[ ] SET|$) # match everything until next 'SET' or EOL +) +""", + re.VERBOSE, +) + +SPOINTS_MATCH = re.compile( + r""" +[ ]* + POINT [ ]+ (?P<number>\d+) [ ]+ (?P<name>\S+) [ ]+ (?P<a>\S+) [ ]+ (?P<b>\S+) [ ]+ (?P<c>\S+) +""", + re.VERBOSE, +) + +POINTS_MATCH = re.compile( + r""" +[ ]* + Nr\. [ ]+ (?P<nr>\d+) [ ]+ + Spin [ ]+ (?P<spin>\d+) [ ]+ + K-Point [ ]+ (?P<a>\S+) [ ]+ (?P<b>\S+) [ ]+ (?P<c>\S+) [ ]* + \n +[ ]* (?P<npoints>\d+) [ ]* \n +(?P<bands> + [\s\S]*?(?=\n.*?[ ] Nr|$) # match everything until next 'Nr.' or EOL +) +""", + re.VERBOSE, +) + + +def _specialpoints_gen(content): + for match in SPOINTS_MATCH.finditer(content): + yield SpecialPoint(int(match["number"]), match["name"], float(match["a"]), float(match["b"]), float(match["c"])) + + +def _points_gen(content): + for match in POINTS_MATCH.finditer(content): + yield Point( + a=float(match["a"]), + b=float(match["b"]), + c=float(match["c"]), + bands=[float(v) for v in match["bands"].split()], + spin=int(match["spin"]), + ) + + +SET_MATCH8 = re.compile( + r""" +\#\ Set\ (?P<setnr>\d+):\ \d+\ special\ points,\ (?P<totalpoints>\d+)\ k-points,\ \d+\ bands \s* +(?P<content> + [\s\S]*?(?=\n.*?\#\ Set|$) # match everything until next 'SET' or EOL +) +""", + re.VERBOSE, +) + + +SPOINTS_MATCH8 = re.compile( + r""" +\#\s+ Special\ point\ (?P<number>\d+) \s+ (?P<name>\S+) \s+ (?P<a>\S+) \s+ (?P<b>\S+) \s+ (?P<c>\S+) +""", + re.VERBOSE, +) + + +POINTS_MATCH8 = re.compile( + r""" +\#\ \ Point\ (?P<nr>\d+)\s+ Spin\ (?P<spin>\d+): \s+ (?P<a>\S+) \s+ (?P<b>\S+) \s+ (?P<c>\S+) [ ]* ((?P<weight>\S+) [ ]*)?
\n +\#\ \ \ Band \s+ Energy\ \[eV\] \s+ Occupation \s* +(?P<bands> + [\s\S]*?(?=\n.*?\#\ \ Point|$) # match everything until next '# Point' or EOL +) +""", + re.VERBOSE, +) + + +def _points_gen8(content): + for match in POINTS_MATCH8.finditer(content): + try: + weight = float(match["weight"]) + except TypeError: + weight = None + + values = match["bands"].split() + + yield Point( + a=float(match["a"]), + b=float(match["b"]), + c=float(match["c"]), + bands=[float(v) for v in values[1::3]], + weight=weight, + spin=int(match["spin"]), + ) + + +def _specialpoints_gen8(content): + for match in SPOINTS_MATCH8.finditer(content): + yield SpecialPoint(int(match["number"]), match["name"], float(match["a"]), float(match["b"]), float(match["c"])) + + +def set_gen(content): + # try with the CP2K v8+ regex first + matchiter = SET_MATCH8.finditer(content) + specialpoints_gen = _specialpoints_gen8 + points_gen = _points_gen8 + + try: + peek = next(matchiter) + matchiter = itertools.chain([peek], matchiter) + except StopIteration: + # if nothing could be found, fall back to the older format + matchiter = SET_MATCH.finditer(content) + specialpoints_gen = _specialpoints_gen + points_gen = _points_gen + + for match in matchiter: + yield (int(match["setnr"]), int(match["totalpoints"]), specialpoints_gen(match["content"]), points_gen(match["content"])) + + +def cp2k_bs2csv(): + parser = argparse.ArgumentParser( + description=""" + Convert the input from the given input file handle and write + CSV output files based on the given pattern. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "bsfile", metavar="", type=argparse.FileType("r"), help="the band structure file generated by CP2K" + ) + parser.add_argument( + "-p", "--output-pattern", help="The output pattern for the different set files", default="{bsfile.name}.set-{setnr}.csv" + ) + args = parser.parse_args() + + content = args.bsfile.read() + + for setnr, totalpoints, specialpoints, points in set_gen(content): + filename = args.output_pattern.format(bsfile=args.bsfile, setnr=setnr) + + with open(filename, "w") as csvout: + print(f"writing point set {filename} (total number of k-points: {totalpoints})") + print("with the following special points:") + + for point in specialpoints: + print(f" {point.name:>8}: {point.a:10.8f} / {point.b:10.8f} / {point.c:10.8f}") + + for point in points: + csvout.write(f"{point.a:10.8f} {point.b:10.8f} {point.c:10.8f}") + for value in point.bands: + csvout.write(f" {value:10.8f}") + csvout.write("\n") diff --git a/cp2k_output_tools/trajectories/xyz.py b/cp2k_output_tools/trajectories/xyz.py new file mode 100644 index 0000000..4242601 --- /dev/null +++ b/cp2k_output_tools/trajectories/xyz.py @@ -0,0 +1,176 @@ +import mmap +import re +import contextlib +from collections.abc import Iterator + + +__all__ = ["parse", "parse_iter", "BlockIterator"] + + +@contextlib.contextmanager +def as_byteorstringlike(fh_or_content): + """ + Yields a tuple (content, content_type), + where content_type is True if content is a unicode string, + and False if it is a bytes-like object (and needs to be decoded).
+ """ + + if isinstance(fh_or_content, str): + yield fh_or_content, True + elif isinstance(fh_or_content, bytes): + yield fh_or_content, False + else: + # if the handle is a file handle, use mmap to return a byte-like object + mmapped = mmap.mmap(fh_or_content.fileno(), 0, access=mmap.ACCESS_READ) + + try: + yield mmapped, False + finally: + mmapped.close() + + +# MULTILINE and VERBOSE regex to match coordinate lines in a frame: +POS_MATCH_REGEX = r""" +^ # Linestart +[ \t]* # Optional white space +(?P[A-Za-z]+[A-Za-z0-9]*)\s+ # get the symbol +(?P [\-\+]? (\d*\.\d+|\d+\.?\d*) ([Ee][\+\-]?\d+)? ) [ \t]+ # Get x +(?P [\-\+]? (\d*\.\d+|\d+\.?\d*) ([Ee][\+\-]?\d+)? ) [ \t]+ # Get y +(?P [\-\+]? (\d*\.\d+|\d+\.?\d*) ([Ee][\+\-]?\d+)? ) # Get z +""" + +# MULTILINE and VERBOSE regex to match frames: +FRAME_MATCH_REGEX = r""" + # First line contains an integer + # and only an integer: the number of atoms +^[ \t]* (?P [0-9]+) [ \t]*[\n] # End first line +(?P.*) [\n] # The second line is a comment +(?P # This is the block of positions + ( + ( + \s* # White space in front of the element spec is ok + ( + [A-Za-z]+[A-Za-z0-9]* # Element spec + ( + \s+ # White space in front of the number + [\-\+]? # Plus or minus in front of the number (optional) + (\d* # optional decimal in the beginning .0001 is ok, for example + \. # There has to be a dot followed by + \d+) # at least one decimal + | # OR + (\d+ # at least one decimal, followed by + \.? # an optional dot + \d*) # followed by optional decimals + ([Ee][\+\-]?\d+)? # optional exponents E+03, e-05 + ){3} # I expect three float values + | + \# # If a line is commented out, that is also ok + ) + .* # I do not care what is after the comment or the position spec + | # OR + \s* # A line only containing white space + ) + [\n] # line break at the end + )+ +) # A positions block should be one or more lines +""" + + +class BlockIterator(Iterator): + """ + A iterator for wrapping the iterator returned by `match.finditer` + to extract the required fields directly from the match object + """ + + def __init__(self, it, natoms): + self._it = it + self._natoms = natoms + self._catom = 0 + + def __next__(self): + try: + match = next(self._it) + except StopIteration: + # if we reached the number of atoms declared, everything is well + # and we re-raise the StopIteration exception + if self._catom == self._natoms: + raise + else: + # otherwise we got too less entries + raise ValueError(f"Number of atom entries {self._catom} is smaller than the number of atoms {self._natoms}") + + self._catom += 1 + + if self._catom > self._natoms: + raise TypeError(f"Number of atom entries {self._catom} is larger than the number of atoms {self._natoms}") + + return (match["sym"], (float(match["x"]), float(match["y"]), float(match["z"]))) + + +def parse_iter(fh_or_string): + """Generates nested tuples for frames in XYZ files. + + Args: + string: a string containing XYZ-structured text + + Yields: + tuple: `(natoms, comment, atomiter)` for each frame + in the XYZ data where `atomiter` is an iterator yielding a + nested tuple `(symbol, (x, y, z))` for each entry. + + Raises: + TypeError: If the number of atoms specified for the frame does not match + the number of atom entries in the file. + + Examples: + >>> print(len(list(parse_iter(''' + ... 5 + ... no comment + ... C 5.0000000000 5.0000000000 5.0000000000 + ... H 5.6401052216 5.6401052216 5.6401052216 + ... H 4.3598947806 4.3598947806 5.6401052208 + ... H 4.3598947806 5.6401052208 4.3598947806 + ... 
H 5.6401052208 4.3598947806 4.3598947806 + ... 5 + ... no comment + ... C 5.0000000000 5.0000000000 5.0000000000 + ... H 5.6401902064 5.6401902064 5.6401902064 + ... H 4.3598097942 4.3598097942 5.6401902063 + ... H 4.3598097942 5.6401902063 4.3598097942 + ... H 5.6401902063 4.3598097942 4.3598097942 + ... 5 + ... no comment + ... C 5.0000000000 5.0000000000 5.0000000000 + ... H 5.6401902064 5.6401902064 5.6401902064 + ... H 4.3598097942 4.3598097942 5.6401902063 + ... H 4.3598097942 5.6401902063 4.3598097942 + ... H 5.6401902063 4.3598097942 4.3598097942 + ... ''')))) + 3 + """ + + with as_byteorstringlike(fh_or_string) as (content, is_string): + if is_string: + frame_match = re.compile(FRAME_MATCH_REGEX, re.MULTILINE | re.VERBOSE) + pos_match = re.compile(POS_MATCH_REGEX, re.MULTILINE | re.VERBOSE) + else: + frame_match = re.compile(FRAME_MATCH_REGEX.encode("utf8"), re.MULTILINE | re.VERBOSE) + pos_match = re.compile(POS_MATCH_REGEX.encode("utf8"), re.MULTILINE | re.VERBOSE) + + for block in frame_match.finditer(content): + natoms = int(block["natoms"]) + yield ( + natoms, + block["comment"] if is_string else block["comment"].decode("utf8"), + BlockIterator(pos_match.finditer(block["positions"]), natoms), + ) + + +def parse(fh_or_string): + """ + The same as parse_iter(...), but returns a list of nested dicts in which the 'atoms' key + holds a list of atom entries instead of another iterator. + """ + return [ + {"natoms": natoms, "comment": comment, "atoms": list(atomiter)} for (natoms, comment, atomiter) in parse_iter(fh_or_string) + ] diff --git a/cp2k_output_tools/trajectories/xyz_cli.py b/cp2k_output_tools/trajectories/xyz_cli.py new file mode 100644 index 0000000..0ba6039 --- /dev/null +++ b/cp2k_output_tools/trajectories/xyz_cli.py @@ -0,0 +1,82 @@ +import sys +import re +import contextlib +import argparse +import mmap + +from .xyz import FRAME_MATCH_REGEX + + +@contextlib.contextmanager +def mmapped(fhandle): + fmapped = mmap.mmap(fhandle.fileno(), 0, access=mmap.ACCESS_READ) + + try: + yield fmapped + finally: + fmapped.close() + + +CP2K_COMMENT_MATCH = re.compile( + r""" +^[ \t]* i [ ] = [ \t]+ (?P \d+), + [ \t]* time [ ] = [ \t]+ (?P
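The README hunk above advertises an API for importing band structure data in addition to the `cp2k_bs2csv` command. A minimal sketch of how the `set_gen` generator added in `cp2k_output_tools/bandstructure.py` could be used from Python; the file name `bands.bs` is a placeholder, everything else follows the signatures introduced in this diff:

```python
# Sketch of the band structure API from this diff: set_gen() yields, per set,
# the set number, the total number of k-points and two generators producing
# SpecialPoint and Point objects ("bands.bs" is a hypothetical input file).
from cp2k_output_tools.bandstructure import set_gen

with open("bands.bs") as fhandle:
    content = fhandle.read()

for setnr, totalpoints, specialpoints, points in set_gen(content):
    print(f"set {setnr} with {totalpoints} k-points")

    for sp in specialpoints:  # SpecialPoint(number, name, a, b, c)
        print(f"  special point {sp.name}: {sp.a} {sp.b} {sp.c}")

    for p in points:  # Point(a, b, c, bands, spin, weight)
        print(f"  k-point ({p.a}, {p.b}, {p.c}) spin {p.spin}: {len(p.bands)} bands")
```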
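Likewise, `cp2k_output_tools/trajectories/xyz.py` can be used as a library rather than through `xyz_restart_cleaner`. A small sketch using `parse_iter` (the trajectory file name is a placeholder); it accepts a string, a bytes-like object or a file handle, in which case the file is mmapped:

```python
# Sketch of the trajectory parsing API from this diff; "traj.xyz" is a
# hypothetical file. parse_iter() yields (natoms, comment, atomiter) per frame.
from cp2k_output_tools.trajectories.xyz import parse_iter

with open("traj.xyz") as fhandle:
    for natoms, comment, atomiter in parse_iter(fhandle.read()):
        print(f"frame with {natoms} atoms: {comment}")
        for sym, (x, y, z) in atomiter:  # atomiter yields (symbol, (x, y, z))
            print(f"  {sym} {x:.6f} {y:.6f} {z:.6f}")
```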