Skip to content

pdb

Pdb

Very basic PDB parser for crystallographic and atomic data.

Source code in chmpy/fmt/pdb.py
class Pdb:
    """
    Very basic PDB parser for crystallographic and atomic data.
    """

    def __init__(self, pdb_data):
        self.data = pdb_data
        self.content_lines = []
        self.header = None
        self.unit_cell = None
        self.space_group = None
        self.z = None
        self.atoms = None
        self.line_index = 0

    def parse_crystal_line(self, line):
        """Parse CRYST1 record containing unit cell parameters."""
        line_length = len(line)
        self.unit_cell = {
            "a": float(line[6:14]),
            "b": float(line[15:23]),
            "c": float(line[24:32]),
            "alpha": float(line[33:39]),
            "beta": float(line[40:46]),
            "gamma": float(line[47:53]),
        }
        self.space_group = "P 1"  # default
        if line_length > 55:
            self.space_group = line[55 : min(line_length, 65)].strip()
        self.z = 1  # default
        if line_length > 66:
            self.z = int(line[66 : min(line_length, 69)])

    def parse_atom_lines(self):
        """Parse ATOM and HETATM records."""
        # Initialize atom data structure
        self.atoms = {
            "serial": [],
            "name": [],
            "alt_loc": [],
            "res_name": [],
            "chain_id": [],
            "res_seq": [],
            "icode": [],
            "x": [],
            "y": [],
            "z": [],
            "occupancy": [],
            "temp_factor": [],
            "element": [],
            "charge": [],
        }

        # Process all atom lines until we hit a non-atom record or end of file
        total_lines = len(self.content_lines)

        while self.line_index < total_lines:
            line = self.content_lines[self.line_index]

            # Check for MODEL/ENDMDL records (handle multiple models)
            if line.startswith("MODEL "):
                self.line_index += 1
                continue
            elif line.startswith("ENDMDL") or line.startswith("END"):
                self.line_index += 1
                break

            # Process atom records
            if line[:6] in ("ATOM  ", "HETATM"):
                try:
                    self.atoms["serial"].append(int(line[6:11]))
                    self.atoms["name"].append(line[12:16].strip())
                    self.atoms["alt_loc"].append(line[16])
                    self.atoms["res_name"].append(line[17:20].strip())
                    self.atoms["chain_id"].append(line[21])
                    self.atoms["res_seq"].append(int(line[22:26]))
                    self.atoms["icode"].append(line[26])
                    self.atoms["x"].append(float(line[30:38]))
                    self.atoms["y"].append(float(line[38:46]))
                    self.atoms["z"].append(float(line[46:54]))
                    self.atoms["occupancy"].append(float(line[54:60]))
                    self.atoms["temp_factor"].append(float(line[60:66]))
                    self.atoms["element"].append(line[76:78].strip())

                    # Parse charge safely
                    chg = line[78:80].strip()
                    self.atoms["charge"].append(float(chg[::-1]) if chg else 0.0)
                except (ValueError, IndexError) as e:
                    # Handle malformed lines gracefully
                    print(
                        f"Warning: Error parsing atom at line {self.line_index + 1}: {e}"
                    )
            else:
                # If we hit a non-atom record, break the parsing loop
                break

            self.line_index += 1

    def parse_header(self):
        """Parse the PDB header to find crystallographic information."""
        total_lines = len(self.content_lines)

        while self.line_index < total_lines:
            line = self.content_lines[self.line_index]
            record_type = line[:6]

            if record_type == "CRYST1":
                self.parse_crystal_line(line)
                self.line_index += 1
            elif record_type == "HEADER":
                self.header = line[10:].strip()
                self.line_index += 1
            elif record_type in ("ATOM  ", "HETATM", "MODEL "):
                # We've reached the atom section
                break
            else:
                # Skip other header records
                self.line_index += 1

        # Debug info - print crystallographic data
        if self.unit_cell:
            print(self.unit_cell)
            print(self.space_group)
            print(self.z)

    def parse(self):
        """Parse the entire PDB contents."""
        self.line_index = 0

        # Parse header section
        self.parse_header()

        # Parse atom records
        self.parse_atom_lines()

        # Skip any remaining records
        while self.line_index < len(self.content_lines):
            self.line_index += 1

        return self.data

    @classmethod
    def from_file(cls, filename):
        """Initialize a PDB parser from a file path."""
        return cls.from_string(Path(filename).read_text())

    @classmethod
    def from_string(cls, contents):
        """Initialize a PDB parser from string contents."""
        pdb = cls({})
        pdb.content_lines = [line.strip() for line in contents.split("\n")]
        pdb.parse()
        return pdb

from_file(filename) classmethod

Initialize a PDB parser from a file path.

Source code in chmpy/fmt/pdb.py
@classmethod
def from_file(cls, filename):
    """Initialize a PDB parser from a file path."""
    return cls.from_string(Path(filename).read_text())

from_string(contents) classmethod

Initialize a PDB parser from string contents.

Source code in chmpy/fmt/pdb.py
@classmethod
def from_string(cls, contents):
    """Initialize a PDB parser from string contents."""
    pdb = cls({})
    pdb.content_lines = [line.strip() for line in contents.split("\n")]
    pdb.parse()
    return pdb

parse()

Parse the entire PDB contents.

Source code in chmpy/fmt/pdb.py
def parse(self):
    """Parse the entire PDB contents."""
    self.line_index = 0

    # Parse header section
    self.parse_header()

    # Parse atom records
    self.parse_atom_lines()

    # Skip any remaining records
    while self.line_index < len(self.content_lines):
        self.line_index += 1

    return self.data

parse_atom_lines()

Parse ATOM and HETATM records.

Source code in chmpy/fmt/pdb.py
def parse_atom_lines(self):
    """Parse ATOM and HETATM records."""
    # Initialize atom data structure
    self.atoms = {
        "serial": [],
        "name": [],
        "alt_loc": [],
        "res_name": [],
        "chain_id": [],
        "res_seq": [],
        "icode": [],
        "x": [],
        "y": [],
        "z": [],
        "occupancy": [],
        "temp_factor": [],
        "element": [],
        "charge": [],
    }

    # Process all atom lines until we hit a non-atom record or end of file
    total_lines = len(self.content_lines)

    while self.line_index < total_lines:
        line = self.content_lines[self.line_index]

        # Check for MODEL/ENDMDL records (handle multiple models)
        if line.startswith("MODEL "):
            self.line_index += 1
            continue
        elif line.startswith("ENDMDL") or line.startswith("END"):
            self.line_index += 1
            break

        # Process atom records
        if line[:6] in ("ATOM  ", "HETATM"):
            try:
                self.atoms["serial"].append(int(line[6:11]))
                self.atoms["name"].append(line[12:16].strip())
                self.atoms["alt_loc"].append(line[16])
                self.atoms["res_name"].append(line[17:20].strip())
                self.atoms["chain_id"].append(line[21])
                self.atoms["res_seq"].append(int(line[22:26]))
                self.atoms["icode"].append(line[26])
                self.atoms["x"].append(float(line[30:38]))
                self.atoms["y"].append(float(line[38:46]))
                self.atoms["z"].append(float(line[46:54]))
                self.atoms["occupancy"].append(float(line[54:60]))
                self.atoms["temp_factor"].append(float(line[60:66]))
                self.atoms["element"].append(line[76:78].strip())

                # Parse charge safely
                chg = line[78:80].strip()
                self.atoms["charge"].append(float(chg[::-1]) if chg else 0.0)
            except (ValueError, IndexError) as e:
                # Handle malformed lines gracefully
                print(
                    f"Warning: Error parsing atom at line {self.line_index + 1}: {e}"
                )
        else:
            # If we hit a non-atom record, break the parsing loop
            break

        self.line_index += 1

parse_crystal_line(line)

Parse CRYST1 record containing unit cell parameters.

Source code in chmpy/fmt/pdb.py
def parse_crystal_line(self, line):
    """Parse CRYST1 record containing unit cell parameters."""
    line_length = len(line)
    self.unit_cell = {
        "a": float(line[6:14]),
        "b": float(line[15:23]),
        "c": float(line[24:32]),
        "alpha": float(line[33:39]),
        "beta": float(line[40:46]),
        "gamma": float(line[47:53]),
    }
    self.space_group = "P 1"  # default
    if line_length > 55:
        self.space_group = line[55 : min(line_length, 65)].strip()
    self.z = 1  # default
    if line_length > 66:
        self.z = int(line[66 : min(line_length, 69)])

parse_header()

Parse the PDB header to find crystallographic information.

Source code in chmpy/fmt/pdb.py
def parse_header(self):
    """Parse the PDB header to find crystallographic information."""
    total_lines = len(self.content_lines)

    while self.line_index < total_lines:
        line = self.content_lines[self.line_index]
        record_type = line[:6]

        if record_type == "CRYST1":
            self.parse_crystal_line(line)
            self.line_index += 1
        elif record_type == "HEADER":
            self.header = line[10:].strip()
            self.line_index += 1
        elif record_type in ("ATOM  ", "HETATM", "MODEL "):
            # We've reached the atom section
            break
        else:
            # Skip other header records
            self.line_index += 1

    # Debug info - print crystallographic data
    if self.unit_cell:
        print(self.unit_cell)
        print(self.space_group)
        print(self.z)