src/parser/pdb-parser.js

/**
 * @file Pdb Parser
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 * @private
 */

import { Matrix4 } from '../../lib/three.es6.js'

import { Debug, Log, ParserRegistry } from '../globals.js'
import { defaults } from '../utils.js'
import StructureParser from './structure-parser.js'
import Entity from '../structure/entity.js'
import Unitcell from '../symmetry/unitcell.js'
import Assembly from '../symmetry/assembly.js'
import { WaterNames } from '../structure/structure-constants.js'
import {
    assignSecondaryStructure, buildUnitcellAssembly,
    calculateBonds, calculateChainnames, calculateSecondaryStructure
} from '../structure/structure-utils.js'

// PDB helix record encoding
const HelixTypes = {
  1: 'h',  // Right-handed alpha (default)
  2: 'h',  // Right-handed omega
  3: 'i',  // Right-handed pi
  4: 'h',  // Right-handed gamma
  5: 'g',  // Right-handed 310
  6: 'h',  // Left-handed alpha
  7: 'h',  // Left-handed omega
  8: 'h',  // Left-handed gamma
  9: 'h',  // 27 ribbon/helix
  10: 'h',  // Polyproline
  '': 'h'
}

const dAminoAcids = [
  'DAL',  // D-ALANINE
  'DAR',  // D-ARGININE
  'DSG',  // D-ASPARAGINE
  'DAS',  // D-ASPARTIC ACID
  'DCY',  // D-CYSTEINE
  'DGL',  // D-GLUTAMIC ACID
  'DGN',  // D-GLUTAMINE
  'DHI',  // D-HISTIDINE
  'DIL',  // D-ISOLEUCINE
  'DLE',  // D-LEUCINE
  'DLY',  // D-LYSINE
  'MED',  // D-METHIONINE
  'DPN',  // D-PHENYLALANINE
  'DPR',  // D-PROLINE
  'DSN',  // D-SERINE
  'DTH',  // D-THREONINE
  'DTR',  // D-TRYPTOPHAN
  'DTY',  // D-TYROSINE
  'DVA',  // D-VALINE

  'DNE'  // D-NORLEUCINE

  // ???  // D-SELENOCYSTEINE
]

const entityKeyList = [
  'MOL_ID', 'MOLECULE', 'CHAIN', 'FRAGMENT', 'SYNONYM',
  'EC', 'ENGINEERED', 'MUTATION', 'OTHER_DETAILS'
]

const reWhitespace = /\s+/

function getModresId (resno, chainname, inscode) {
  let id = `${resno}`
  if (chainname) id += `:${chainname}`
  if (inscode) id += `^${inscode}`
  return id
}

class PdbParser extends StructureParser {
  /**
   * Create a pdb parser
   * @param  {Streamer} streamer - streamer object
   * @param  {Object} params - params object
   * @param  {Boolean} params.hex - hexadecimal parsing of
   *                                atom numbers >99.999 and
   *                                residue numbers >9.999
   * @return {undefined}
   */
  constructor (streamer, params) {
    const p = params || {}

    super(streamer, p)

    this.hex = defaults(p.hex, false)
  }

  get type () { return 'pdb' }

  _parse () {
    // http://www.wwpdb.org/documentation/file-format.php

    if (Debug) Log.time('PdbParser._parse ' + this.name)

    let isLegacy = false
    const headerLine = this.streamer.peekLines(1)[ 0 ]
    const headerId = headerLine.substr(62, 4)
    const legacyId = headerLine.substr(72, 4)
    if (headerId === legacyId && legacyId.trim()) {
      isLegacy = true
    }

    const isPqr = this.type === 'pqr'

    const s = this.structure
    const sb = this.structureBuilder

    const hex = this.hex
    let serialRadix = 10
    let resnoRadix = 10

    const firstModelOnly = this.firstModelOnly
    const asTrajectory = this.asTrajectory
    const cAlphaOnly = this.cAlphaOnly

    const frames = s.frames
    const boxes = s.boxes
    let doFrames = false
    let currentFrame, currentCoord

    const biomolDict = s.biomolDict
    let currentBiomol
    let currentPart
    let currentMatrix

    let line, recordName
    let serial, chainname, resno, resname, occupancy
    let inscode, atomname, hetero, bfactor, altloc

    let startChain, startResi, startIcode
    let endChain, endResi, endIcode

    let serialDict = {}
    const unitcellDict = {}
    const bondDict = {}

    const entityDataList = []
    let currentEntityData
    let currentEntityKey
    // MOL_ID                 Numbers each component; also used in  SOURCE to associate
    //                        the information.
    // MOLECULE               Name of the macromolecule.
    // CHAIN                  Comma-separated list of chain  identifier(s).
    // FRAGMENT               Specifies a domain or region of the  molecule.
    // SYNONYM                Comma-separated list of synonyms for  the MOLECULE.
    // EC                     The Enzyme Commission number associated  with the molecule.
    //                        If there is more than one EC number,  they are presented
    //                        as a comma-separated list.
    // ENGINEERED             Indicates that the molecule was  produced using
    //                        recombinant technology or by purely  chemical synthesis.
    // MUTATION               Indicates if there is a mutation.
    // OTHER_DETAILS          Additional comments.

    const hetnameDict = {}
    const modresDict = {}

    const chainDict = {}
    let chainIdx, chainid, newChain
    let currentChainname, currentResno, currentResname, currentInscode

    const seqresDict = {}
    let currentSeqresChainname

    const secStruct = {
      helices: [],
      sheets: []
    }
    const helices = secStruct.helices
    const sheets = secStruct.sheets

    const atomMap = s.atomMap
    const atomStore = s.atomStore
    atomStore.resize(Math.round(this.streamer.data.length / 80))
    if (isPqr) {
      atomStore.addField('partialCharge', 1, 'float32')
      atomStore.addField('radius', 1, 'float32')
    }

    const ap1 = s.getAtomProxy()
    const ap2 = s.getAtomProxy()

    let idx = 0
    let modelIdx = 0
    let pendingStart = true

    function _parseChunkOfLines (_i, _n, lines) {
      for (let i = _i; i < _n; ++i) {
        line = lines[ i ]
        recordName = line.substr(0, 6)

        if (recordName === 'ATOM  ' || recordName === 'HETATM') {
          // http://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
          // PQR: Field_name Atom_number Atom_name Residue_name Chain_ID Residue_number X Y Z Charge Radius

          if (pendingStart) {
            if (asTrajectory) {
              if (doFrames) {
                currentFrame = new Float32Array(atomStore.count * 3)
                frames.push(currentFrame)
              } else {
                currentFrame = []
              }
              currentCoord = 0
            } else {
              if (!firstModelOnly) serialDict = {}
            }

            chainIdx = 1
            chainid = chainIdx.toString()
            newChain = true

            pendingStart = false
          }

          if (firstModelOnly && modelIdx > 0) continue

          let x, y, z, ls, dd

          if (isPqr) {
            ls = line.split(reWhitespace)
            dd = ls.length === 10 ? 1 : 0

            atomname = ls[ 2 ]
            if (cAlphaOnly && atomname !== 'CA') continue

            x = parseFloat(ls[ 6 - dd ])
            y = parseFloat(ls[ 7 - dd ])
            z = parseFloat(ls[ 8 - dd ])
          } else {
            atomname = line.substr(12, 4).trim()
            if (cAlphaOnly && atomname !== 'CA') continue

            x = parseFloat(line.substr(30, 8))
            y = parseFloat(line.substr(38, 8))
            z = parseFloat(line.substr(46, 8))
          }

          if (asTrajectory) {
            const j = currentCoord * 3

            currentFrame[ j + 0 ] = x
            currentFrame[ j + 1 ] = y
            currentFrame[ j + 2 ] = z

            currentCoord += 1

            if (doFrames) continue
          }

          let element

          if (isPqr) {
            serial = parseInt(ls[ 1 ])
            element = ''
            hetero = (line[ 0 ] === 'H') ? 1 : 0
            chainname = dd ? '' : ls[ 4 ]
            resno = parseInt(ls[ 5 - dd ])
            inscode = ''
            resname = ls[ 3 ]
            altloc = ''
            occupancy = 0.0
          } else {
            serial = parseInt(line.substr(6, 5), serialRadix)
            if (hex && serial === 99999) {
              serialRadix = 16
            }
            hetero = (line[ 0 ] === 'H') ? 1 : 0
            chainname = line[ 21 ].trim()
            resno = parseInt(line.substr(22, 4), resnoRadix)
            if (hex && resno === 9999) {
              resnoRadix = 16
            }
            inscode = line[ 26 ].trim()
            resname = line.substr(17, 4).trim() || 'MOL'
            bfactor = parseFloat(line.substr(60, 6))
            altloc = line[ 16 ].trim()
            occupancy = parseFloat(line.substr(54, 6))

            if (!isLegacy) {
              element = line.substr(76, 2).trim()
              if (!chainname) {
                chainname = line.substr(72, 4).trim()  // segid
              }
            }
          }

          atomStore.growIfFull()
          atomStore.atomTypeId[ idx ] = atomMap.add(atomname, element)

          atomStore.x[ idx ] = x
          atomStore.y[ idx ] = y
          atomStore.z[ idx ] = z
          atomStore.serial[ idx ] = serial
          atomStore.altloc[ idx ] = altloc.charCodeAt(0)
          atomStore.occupancy[ idx ] = isNaN(occupancy) ? 0 : occupancy

          if (isPqr) {
            atomStore.partialCharge[ idx ] = parseFloat(ls[ 9 - dd ])
            atomStore.radius[ idx ] = parseFloat(ls[ 10 - dd ])
          } else {
            atomStore.bfactor[ idx ] = isNaN(bfactor) ? 0 : bfactor
          }

          const modresId = getModresId(resno, chainname, inscode)

          // TODO instead of looking at MODRES look at SEQRES and
          //      missing residues in REMARK 465
          if (hetero && !modresDict[modresId] && !dAminoAcids.includes(resname)) {
            if (currentChainname !== chainname || currentResname !== resname ||
                (!WaterNames.includes(resname) &&
                  (currentResno !== resno || currentInscode !== inscode))
            ) {
              chainIdx += 1
              chainid = chainIdx.toString()

              currentResno = resno
              currentResname = resname
              currentInscode = inscode
            }
          } else if (!newChain && currentChainname !== chainname) {
            chainIdx += 1
            chainid = chainIdx.toString()
          }

          sb.addAtom(modelIdx, chainname, chainid, resname, resno, hetero, undefined, inscode)

          serialDict[ serial ] = idx
          idx += 1
          newChain = false
          currentChainname = chainname
        } else if (recordName === 'CONECT') {
          const fromIdx = serialDict[ parseInt(line.substr(6, 5)) ]
          const pos = [ 11, 16, 21, 26 ]
          const bondIndex = {}

          if (fromIdx === undefined) {
            // Log.log( "missing CONNECT serial" );
            continue
          }

          for (let j = 0; j < 4; ++j) {
            let toIdx = parseInt(line.substr(pos[ j ], 5))
            if (Number.isNaN(toIdx)) continue
            toIdx = serialDict[ toIdx ]
            if (toIdx === undefined) {
              // Log.log( "missing CONNECT serial" );
              continue
            }/* else if( toIdx < fromIdx ){
                // likely a duplicate in standard PDB format
                // but not necessarily, so better remove duplicates
                // in a pass after parsing (and auto bonding)
                continue;
            } */

            if (fromIdx < toIdx) {
              ap1.index = fromIdx
              ap2.index = toIdx
            } else {
              ap1.index = toIdx
              ap2.index = fromIdx
            }

            // interpret records where a 'toIdx' atom is given multiple times
            // as double/triple bonds, e.g. CONECT 1529 1528 1528 is a double bond
            if (bondIndex[ toIdx ] !== undefined) {
              s.bondStore.bondOrder[ bondIndex[ toIdx ] ] += 1
            } else {
              const hash = ap1.index + '|' + ap2.index
              if (bondDict[ hash ] === undefined) {
                bondDict[ hash ] = true
                bondIndex[ toIdx ] = s.bondStore.count
                s.bondStore.addBond(ap1, ap2, 1)  // start/assume with single bond
              }
            }
          }
        } else if (recordName === 'HELIX ') {
          startChain = line[ 19 ].trim()
          startResi = parseInt(line.substr(21, 4))
          startIcode = line[ 25 ].trim()
          endChain = line[ 31 ].trim()
          endResi = parseInt(line.substr(33, 4))
          endIcode = line[ 37 ].trim()
          let helixType = parseInt(line.substr(39, 1))
          helixType = (HelixTypes[ helixType ] || HelixTypes['']).charCodeAt(0)
          helices.push([
            startChain, startResi, startIcode,
            endChain, endResi, endIcode,
            helixType
          ])
        } else if (recordName === 'SHEET ') {
          startChain = line[ 21 ].trim()
          startResi = parseInt(line.substr(22, 4))
          startIcode = line[ 26 ].trim()
          endChain = line[ 32 ].trim()
          endResi = parseInt(line.substr(33, 4))
          endIcode = line[ 37 ].trim()
          sheets.push([
            startChain, startResi, startIcode,
            endChain, endResi, endIcode
          ])
        } else if (recordName === 'HETNAM') {
          hetnameDict[ line.substr(11, 3) ] = line.substr(15).trim()
        } else if (recordName === 'SEQRES') {
          const seqresChainname = line[11].trim()
          if (seqresChainname !== currentSeqresChainname) {
            seqresDict[ seqresChainname ] = []
            currentSeqresChainname = seqresChainname
          }
          seqresDict[ seqresChainname ].push(
            ...line.substr(19).trim().split(reWhitespace)
          )
        } else if (recordName === 'MODRES') {
          // MODRES 2SRC PTR A  527  TYR  O-PHOSPHOTYROSINE
          const resname = line.substr(12, 3).trim()
          const chainname = line[16].trim()
          const inscode = line[22].trim()
          const resno = parseInt(line.substr(18, 4).trim())
          const id = getModresId(resno, chainname, inscode)
          modresDict[ id ] = { resname, chainname, inscode, resno }
        } else if (recordName === 'COMPND') {
          const comp = line.substr(10, 70).trim()
          const keyEnd = comp.indexOf(':')
          const key = comp.substring(0, keyEnd)
          let value

          if (entityKeyList.includes(key)) {
            currentEntityKey = key
            value = comp.substring(keyEnd + 2)
          } else {
            value = comp
          }
          value = value.replace(/;$/, '')

          if (currentEntityKey === 'MOL_ID') {
            currentEntityData = {
              chainList: [],
              name: ''
            }
            entityDataList.push(currentEntityData)
          } else if (currentEntityKey === 'MOLECULE') {
            if (currentEntityData.name) currentEntityData.name += ' '
            currentEntityData.name += value
          } else if (currentEntityKey === 'CHAIN') {
            Array.prototype.push.apply(
              currentEntityData.chainList,
              value.split(/\s*,\s*/)
            )
          }
        } else if (line.startsWith('TER')) {
          const cp = s.getChainProxy(s.chainStore.count - 1)
          chainDict[ cp.chainname ] = cp.index
          chainIdx += 1
          chainid = chainIdx.toString()
          newChain = true
        } else if (recordName === 'REMARK' && line.substr(7, 3) === '350') {
          if (line.substr(11, 12) === 'BIOMOLECULE:') {
            let name = line.substr(23).trim()
            if (/^(0|[1-9][0-9]*)$/.test(name)) name = 'BU' + name

            currentBiomol = new Assembly(name)
            biomolDict[ name ] = currentBiomol
          } else if (line.substr(13, 5) === 'BIOMT') {
            const biomt = line.split(/\s+/)
            const row = parseInt(line[ 18 ]) - 1

            if (row === 0) {
              currentMatrix = new Matrix4()
              currentPart.matrixList.push(currentMatrix)
            }

            const biomtElms = currentMatrix.elements

            biomtElms[ 4 * 0 + row ] = parseFloat(biomt[ 4 ])
            biomtElms[ 4 * 1 + row ] = parseFloat(biomt[ 5 ])
            biomtElms[ 4 * 2 + row ] = parseFloat(biomt[ 6 ])
            biomtElms[ 4 * 3 + row ] = parseFloat(biomt[ 7 ])
          } else if (
            line.substr(11, 30) === 'APPLY THE FOLLOWING TO CHAINS:' ||
            line.substr(11, 30) === '                   AND CHAINS:'
          ) {
            if (line.substr(11, 5) === 'APPLY') {
              currentPart = currentBiomol.addPart()
            }

            const chainList = line.substr(41, 30).split(',')
            for (let j = 0, jl = chainList.length; j < jl; ++j) {
              const c = chainList[ j ].trim()
              if (c) currentPart.chainList.push(c)
            }
          }
        } else if (recordName === 'HEADER') {
          s.id = line.substr(62, 4)
        } else if (recordName === 'TITLE ') {
          s.title += (s.title ? ' ' : '') + line.substr(10, 70).trim()
        } else if (recordName === 'MODEL ') {
          pendingStart = true
        } else if (recordName === 'ENDMDL' || line.startsWith('END')) {
          if (pendingStart) continue

          if (asTrajectory && !doFrames) {
            frames.push(new Float32Array(currentFrame))
            doFrames = true
          }

          modelIdx += 1
          pendingStart = true
        } else if (line.substr(0, 5) === 'MTRIX') {
          // ignore 'given' operators
          if (line[ 59 ] === '1') continue

          if (!currentBiomol || currentBiomol.name !== 'NCS') {
            const ncsName = 'NCS'
            currentBiomol = new Assembly(ncsName)
            biomolDict[ ncsName ] = currentBiomol
            currentPart = currentBiomol.addPart()
          }

          const ncs = line.split(/\s+/)
          const ncsRow = parseInt(line[ 5 ]) - 1

          if (ncsRow === 0) {
            currentMatrix = new Matrix4()
            currentPart.matrixList.push(currentMatrix)
          }

          const ncsElms = currentMatrix.elements

          ncsElms[ 4 * 0 + ncsRow ] = parseFloat(ncs[ 2 ])
          ncsElms[ 4 * 1 + ncsRow ] = parseFloat(ncs[ 3 ])
          ncsElms[ 4 * 2 + ncsRow ] = parseFloat(ncs[ 4 ])
          ncsElms[ 4 * 3 + ncsRow ] = parseFloat(ncs[ 5 ])
        } else if (line.substr(0, 5) === 'ORIGX') {
          if (!unitcellDict.origx) {
            unitcellDict.origx = new Matrix4()
          }

          const orgix = line.split(/\s+/)
          const origxRow = parseInt(line[ 5 ]) - 1
          const origxElms = unitcellDict.origx.elements

          origxElms[ 4 * 0 + origxRow ] = parseFloat(orgix[ 1 ])
          origxElms[ 4 * 1 + origxRow ] = parseFloat(orgix[ 2 ])
          origxElms[ 4 * 2 + origxRow ] = parseFloat(orgix[ 3 ])
          origxElms[ 4 * 3 + origxRow ] = parseFloat(orgix[ 4 ])
        } else if (line.substr(0, 5) === 'SCALE') {
          if (!unitcellDict.scale) {
            unitcellDict.scale = new Matrix4()
          }

          const scale = line.split(/\s+/)
          const scaleRow = parseInt(line[ 5 ]) - 1
          const scaleElms = unitcellDict.scale.elements

          scaleElms[ 4 * 0 + scaleRow ] = parseFloat(scale[ 1 ])
          scaleElms[ 4 * 1 + scaleRow ] = parseFloat(scale[ 2 ])
          scaleElms[ 4 * 2 + scaleRow ] = parseFloat(scale[ 3 ])
          scaleElms[ 4 * 3 + scaleRow ] = parseFloat(scale[ 4 ])
        } else if (recordName === 'CRYST1') {
          // CRYST1   55.989   55.989   55.989  90.00  90.00  90.00 P 1           1
          //  7 - 15       Real(9.3)      a (Angstroms)
          // 16 - 24       Real(9.3)      b (Angstroms)
          // 25 - 33       Real(9.3)      c (Angstroms)
          // 34 - 40       Real(7.2)      alpha         alpha (degrees).
          // 41 - 47       Real(7.2)      beta          beta (degrees).
          // 48 - 54       Real(7.2)      gamma         gamma (degrees).
          // 56 - 66       LString        sGroup        Space group.
          // 67 - 70       Integer        z             Z value.

          const aLength = parseFloat(line.substr(6, 9))
          const bLength = parseFloat(line.substr(15, 9))
          const cLength = parseFloat(line.substr(24, 9))

          const alpha = parseFloat(line.substr(33, 7))
          const beta = parseFloat(line.substr(40, 7))
          const gamma = parseFloat(line.substr(47, 7))

          const sGroup = line.substr(55, 11).trim()
          // const zValue = parseInt( line.substr( 66, 4 ) );

          const box = new Float32Array(9)
          box[ 0 ] = aLength
          box[ 4 ] = bLength
          box[ 8 ] = cLength
          boxes.push(box)

          if (modelIdx === 0) {
            unitcellDict.a = aLength
            unitcellDict.b = bLength
            unitcellDict.c = cLength
            unitcellDict.alpha = alpha
            unitcellDict.beta = beta
            unitcellDict.gamma = gamma
            unitcellDict.spacegroup = sGroup
          }
        }
      }
    }

    this.streamer.eachChunkOfLines(function (lines/*, chunkNo, chunkCount */) {
      _parseChunkOfLines(0, lines.length, lines)
    })

    //

    const en = entityDataList.length

    if (en) {
      s.eachChain(function (cp) {
        cp.entityIndex = en
      })

      entityDataList.forEach(function (e, i) {
        const chainIndexList = e.chainList.map(function (chainname) {
          return chainDict[ chainname ]
        })
        s.entityList.push(new Entity(
          s, i, e.name, 'polymer', chainIndexList
        ))
      })

      let ei = entityDataList.length
      const rp = s.getResidueProxy()
      const residueDict = {}

      s.eachChain(function (cp) {
        if (cp.entityIndex === en) {
          rp.index = cp.residueOffset
          if (!residueDict[ rp.resname ]) {
            residueDict[ rp.resname ] = []
          }
          residueDict[ rp.resname ].push(cp.index)
        }
      })

      Object.keys(residueDict).forEach(function (resname) {
        const chainList = residueDict[ resname ]
        let type = 'non-polymer'
        let name = hetnameDict[ resname ] || resname
        if (WaterNames.includes(resname)) {
          name = 'water'
          type = 'water'
        }
        s.entityList.push(new Entity(
          s, ei, name, type, chainList
        ))
        ei += 1
      })
    }

    //

    if (unitcellDict.a !== undefined) {
      s.unitcell = new Unitcell(unitcellDict)
    } else {
      s.unitcell = undefined
    }

    if (helices.length || sheets.length) {
      assignSecondaryStructure(s, secStruct)
    }

    sb.finalize()
    s.finalizeAtoms()
    if (!isLegacy) calculateChainnames(s)
    calculateBonds(s)
    s.finalizeBonds()

    if (!helices.length && !sheets.length) {
      calculateSecondaryStructure(s)
    }
    buildUnitcellAssembly(s)

    if (Debug) Log.timeEnd('PdbParser._parse ' + this.name)
  }
}

ParserRegistry.add('pdb', PdbParser)
ParserRegistry.add('pdb1', PdbParser)
ParserRegistry.add('ent', PdbParser)

export default PdbParser

export {
  HelixTypes
}