/**
 * @fileoverview Utilities related to molecular formats
 */

/**
 * Regular expression for one atom line in a mol block.
 * Produces named capturing groups for the main components:
 *     ```x, y, z, element, numbers```.
 * In addition, to support access of the mapindex field, it also captures subgroups of `numbers`:
 *     ```first9numbers, mapindex, last2numbers```.
 * The value is the result of a single `getMolAtomLineRegex()` call made when this module is evaluated.
 * @type {RegExp}
 */
export const MolAtomLineRegex = getMolAtomLineRegex();

//--- openbabel supports 88 file types, we might as well support them all!
// Format identifiers (all lowercase) that the lookup helpers in this module test against.
const supportedMoltypes = [
    'sdf', 'pdb', 'smi', 'inchi', 'cdx', 'mol2',
    'cif', 'mol', 'sd', 'xyz', 'cdxml', 'cml',
];

// Subset of formats accepted as protein inputs.
const supportedProteinTypes = [
    'pdb',
    'cif',
];

// Formats reported as 2D by is2dFormat().
const twoDformats = [
    'smi',
    'cdx',
    'cdxml',
    'inchi',
];

// Formats reported as binary by isBinaryFormat().
const binaryFormats = [
    'cdx',
];

// Formats reported as protein export targets by isProteinExportFormat().
const proteinExportFormats = [
    'pdb',
    'mol2',
];

/**
 * Get the list of supported molecule formats.
 * The module's own array is returned (not a copy), so callers should treat it as read-only.
 * @returns {string[]}
 */
export function getSupportedMoltypes() { return supportedMoltypes; }

/**
 * Check whether a molecule format appears in the supported list.
 * The comparison is exact: the argument is not lowercased before the lookup.
 * @param {string} moltype
 * @returns {boolean}
 */
export function isSupportedMoltype(moltype) {
    const knownTypes = supportedMoltypes;
    return knownTypes.includes(moltype);
}

/**
 * Get the list of supported protein formats.
 * The module's own array is returned (not a copy), so callers should treat it as read-only.
 * @returns {string[]}
 */
export function getSupportedProteinTypes() { return supportedProteinTypes; }

/**
 * Check whether a format appears in the supported protein format list.
 * The comparison is exact: the argument is not lowercased before the lookup.
 * @param {string} type
 * @returns {boolean}
 */
export function isSupportedProteinType(type) {
    const knownTypes = supportedProteinTypes;
    return knownTypes.includes(type);
}

/**
 * Check whether a format is in the module's list of 2D formats.
 * The format is lowercased before the lookup, so 'SMI' and 'smi' give the same answer.
 * @param {string} format
 * @returns {boolean}
 */
export function is2dFormat(format) {
    const normalized = format.toLowerCase();
    return twoDformats.includes(normalized);
}

/**
 * Check whether a format is in the module's list of binary formats.
 * The format is lowercased before the lookup, so 'CDX' and 'cdx' give the same answer.
 * @param {string} format
 * @returns {boolean}
 */
export function isBinaryFormat(format) {
    const normalized = format.toLowerCase();
    return binaryFormats.includes(normalized);
}

/**
 * Get the list of formats proteins can be exported to.
 * The module's own array is returned (not a copy), so callers should treat it as read-only.
 * @returns {string[]}
 */
export function getProteinExportFormats() {
    return proteinExportFormats;
}
/**
 * Check whether a format is in the module's list of protein export formats.
 * The format is lowercased before the lookup, so 'PDB' and 'pdb' give the same answer.
 * @param {string} format
 * @returns {boolean}
 */
export function isProteinExportFormat(format) {
    const normalized = format.toLowerCase();
    return proteinExportFormats.includes(normalized);
}

/**
 * Strip the optional molecule name from a SMILES string.
 * A SMILES line may carry a name after the structure, separated by whitespace; this returns
 * only the first run of non-whitespace characters (leading whitespace is skipped).
 * @param {string} smilesStr
 * @returns {string|null} the bare structure, or null when the input is not a string
 *     or contains no non-whitespace characters
 */
export function removeSmilesMolName(smilesStr) {
    if (typeof (smilesStr) !== 'string') {
        return null;
    }
    const found = smilesStr.match(/([^\s]+)[\s]?.*/);
    if (!found) {
        return null;
    }
    return found[1] || null;
}

/**
 * Build a named SMILES string: the bare structure, a tab, then the name.
 * Any name already present in `smilesStr` is dropped. When the structure cannot be
 * extracted or no name is given, `smilesStr` is returned as it was passed in.
 * @param {string} smilesStr
 * @param {string} name
 * @returns {string}
 */
export function makeNamedSmiles(smilesStr, name) {
    const structure = removeSmilesMolName(smilesStr);
    const canRename = Boolean(structure && name);
    return canRename ? `${structure}\t${name}` : smilesStr;
}

/**
 * Decide whether a molecule in a 3D-capable format carries only 2D coordinates.
 * Only mol/sdf data can be checked (by looking for zero z coordinates).
 * @param {MolDataSource} molSource object whose `molData` and `molFormat` are inspected
 * @returns {boolean|undefined} the molfile check result, or undefined ("don't know") when
 *     there is no data or the format is not 'mol' or 'sdf'
 */
export function needs3D(molSource) {
    const isCheckable = Boolean(molSource.molData)
        && ['mol', 'sdf'].includes(molSource.molFormat);
    if (isCheckable) {
        return molfileNeeds3D(molSource.molData);
    }
    return undefined; // don't know
}

// MDL / MOL utilities
/**
 * Heuristic check for a mol/sdf text whose atoms have zero z coordinates.
 *
 * Scans for z fields written exactly as `0.0000` that are followed by one space, one
 * uppercase letter and more whitespace (so two-letter element symbols are not counted),
 * and stops at the first `$$$$` record terminator so that large files are not fully scanned.
 * @param {string} molText mol or sdf text
 * @returns {boolean} true as soon as a fourth such atom line is found before the first
 *     `$$$$`; false otherwise
 */
export function molfileNeeds3D(molText) {
    // matchAll yields matches lazily, so returning early also stops the search.
    const hits = molText.matchAll(/\s+0\.0000\s{1}[A-Z]{1}\s+|\${4}/g);
    let zeroZAtoms = 0;
    for (const [text] of hits) {
        // End of the first molecule: nothing conclusive was found.
        if (text.includes('$$$$')) {
            return false;
        }
        zeroZAtoms += 1;
        if (zeroZAtoms > 3) {
            return true;
        }
    }
    return false;
}

/**
 * Change the compound name embedded in the mol text (the first line) to the provided name.
 * The line ending of the first line is kept as it was, so CRLF text stays CRLF throughout.
 * @param {string} molText The original mol string
 * @param {string} compoundName The new compound name
 * @returns {string} The mol text with its first line replaced, or `molText` unchanged when
 *     it contains no newline or `compoundName` is empty/missing
 */
export function replaceNameInMolText(molText, compoundName) {
    const eol = molText.indexOf('\n');
    if (eol === -1 || !compoundName) {
        return molText;
    }
    // Start the retained tail at the '\r' of a CRLF pair so the first line ending is not
    // silently converted to a bare LF.
    const tailStart = (eol > 0 && molText[eol - 1] === '\r') ? eol - 1 : eol;
    return `${compoundName}${molText.slice(tailStart)}`;
}

/**
 * Reset the atom mapping index of every atom line in a mol block to `  0`.
 * The mapping index is the third-from-last numeric column of an atom line; MolAtomLineRegex
 * exposes it as the `mapindex` named group. Lines that do not match the atom line pattern
 * are left as they are.
 * @param {string} molText a molblock from which to remove atom mapping indices
 * @returns {string} The molblock without atom mapping indices
 */
export function removeAtomMappingFromMolText(molText) {
    const blankMapIndex = '  0';
    // Same pieces the pattern captured, with the mapindex column swapped for the blank value.
    const rewrittenLine = `$<x>$<y>$<z> $<element>$<first9numbers>${blankMapIndex}$<last2numbers>`;
    // Global + multiline so the pattern's ^ anchor applies to each line of the block.
    return molText.replace(new RegExp(MolAtomLineRegex, 'gm'), rewrittenLine);
}

/**
 * Assemble a molfile atom line from its components.
 * Either pass a ready-made `coords` string, or pass `x`, `y`, `z`; each of those is converted
 * with Number(), written with 4 decimals and left-padded with spaces to 10 characters.
 * A non-empty `coords` takes precedence over `x`, `y`, `z`.
 * @param {{
 *     coords: string?,
 *     x: number?, y: number?, z: number?,
 *     element: string,
 *     numbers: string,
 * }} param0
 * @returns {string} `<coords> <element><numbers>`
 */
export function formMolAtomLine({ coords: presetCoords, x, y, z, element, numbers }) {
    let coordText = presetCoords;
    if (!coordText) {
        coordText = '';
        for (const value of [x, y, z]) {
            coordText += Number(value).toFixed(4).padStart(10, ' ');
        }
    }
    return `${coordText} ${element}${numbers}`;
}

/**
 * Build the regular expression for one atom line in a mol block.
 * The pattern is anchored at the start (`^`), carries no flags, and exposes these named groups:
 *     ```x, y, z, element, numbers```.
 * To give access to the mapping index, `numbers` is further split into:
 *     ```first9numbers, mapindex, last2numbers```.
 * @returns {RegExp}
 *
 * Mol atom line pattern: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
 * Source: MDL (mol, sdf) Format Spec.pdf, located in:
 * https://www.dropbox.com/home/Conifer%20Point/Software%20Development/External%20Documentation
 * x,y,z coordinates; aaa Atom symbol
 * dd mass difference; ccc charge; sss atom stereo parity; hhh H count; bbb stereo care box
 * vvv valence; HHH H0 designator; rrr / iii not used; mmm mapping number; nnn inversion flag
 * eee exact change flag
 */
function getMolAtomLineRegex() {
    // Fixed-width field patterns.
    const coordField = '(?: |-|\\d){5}\\.\\d{4}'; // 10 columns: 5 before the point, 4 after
    const elementField = '[A-Za-z ]{3}';
    const twoColField = '(?:\\d| ){2}';
    const threeColField = '(?:\\d| ){3}';

    // Wrap a pattern fragment in a named capturing group.
    const named = (name, body) => `(?<${name}>${body})`;

    const coordinates = ['x', 'y', 'z'].map((axis) => named(axis, coordField)).join('');
    const element = named('element', elementField);
    // dd plus the eight 3-column fields that precede the mapping number.
    const first9 = named('first9numbers', twoColField + threeColField.repeat(8));
    const mapIndex = named('mapindex', threeColField);
    const last2 = named('last2numbers', threeColField.repeat(2));
    const numbers = named('numbers', `${first9}${mapIndex}${last2}`);

    return new RegExp(`^${coordinates} ${element}${numbers}`);
}

// SDF Utilities

/**
 * Make an SDF entry for a single molecule, given a mol string and a properties object.
 * The entry is the mol block (trailing whitespace trimmed, then one newline), one data item
 * per own enumerable property, and the `$$$$` record terminator.
 * @param {string} molblock
 * @param {object} [properties] name/value pairs to emit as data items; when omitted or null
 *     the entry contains only the mol block and the terminator
 * @returns {string}
 */
export function makeOneSDF(molblock, properties) {
    // Object.entries throws on null/undefined, so treat a missing properties object as empty.
    const propertyEntries = Object.entries(properties ?? {});
    const parts = [
        `${molblock.trimEnd()}\n`,
        // Each formatted property already ends with a blank line.
        ...propertyEntries.map(([name, value]) => oneSdfProp(name, value)),
        '$$$$\n',
    ];
    return parts.join('');
}

/**
 * Format a single SDF property (data item), given a name and value.
 * According to the spec, the value should have a maximum length of 200 characters (not enforced).
 * It's also allowed to add something after the property name's closing bracket.
 * I believe I have seen molecule indices included this way in large SDFs like NP Atlas.
 * This function does not yet support that.
 *
 * The value is converted with toString(); carriage returns are removed, leading newlines are
 * dropped, every run of consecutive newlines is collapsed to one, and trailing whitespace is
 * trimmed, so the emitted value never contains an empty line.
 * @param {string} name
 * @param {*} valueIn value to write; null and undefined are written as an empty value
 * @returns {string} `> <name>`, the value, and a terminating blank line
 */
export function oneSdfProp(name, valueIn) {
    const rawText = (valueIn == null) ? '' : valueIn.toString();
    // A blank line ends an SDF data item, so the value itself must not contain one.
    const value = rawText
        .replace(/\r/g, '')
        // A leading newline would put a blank line directly under the header.
        .replace(/^\n+/, '')
        // Collapse whole runs: replacing pairs only would leave "\n\n" behind for 3+ newlines.
        .replace(/\n{2,}/g, '\n')
        .trimEnd();
    return `> <${name}>\n${value}\n\n`;
}

/**
 * Produce a csv string.
 * String cells are always wrapped in double quotes, with embedded quotes doubled.
 * Other cells are written with String(); they are quoted the same way only when their text
 * contains a quote, a line break, or one of the separators, so that values such as arrays
 * (whose text contains commas) cannot spill into neighbouring columns.
 * @param {Array<Array<*>>} dataRows rows, each an array of cell values
 * @param {{ header: string[], colSep: string, rowSep: string }} optionalParams
 *     `header` is prepended as the first row when it is non-empty; `colSep` defaults to ','
 *     and `rowSep` to '\n'
 * @returns {string}
 */
export function makeCsvString(dataRows, { header=[], colSep=',', rowSep='\n' }={}) {
    // Optional header can be specified in the params, or it could already be in the data rows.
    const rows = (header?.length > 0) ? [header, ...dataRows] : dataRows;

    // Double-up quotation marks, the CSV way of encoding them, and wrap in quotes.
    const quote = (text) => `"${text.replace(/"/g, '""')}"`;

    // Text is unsafe to emit bare when it contains anything a CSV reader treats as structure.
    const separators = [colSep, rowSep].map(String).filter((sep) => sep !== '');
    const needsQuoting = (text) => /["\r\n]/.test(text)
        || separators.some((sep) => text.includes(sep));

    const csvCell = (cell) => {
        if (typeof cell === 'string') return quote(cell);
        const text = String(cell);
        return needsQuoting(text) ? quote(text) : text;
    };

    return rows
        .map((row) => row.map(csvCell).join(colSep))
        .join(rowSep);
}

/**
 * Attempt to determine the molecular format of a text data representation,
 * based on heuristics of varying rigor. Checks run in a fixed order and the first hit wins:
 * mol/sdf, mol2, pdb/cif, xyz, cml, cdxml, inchi, smi.
 *
 * @param {string} data molecule data text
 * @returns {string} the format guess or empty string if it can't find a match
 */
export function guessFormat(data) {
    if (!data) return '';

    const contains = (marker) => data.indexOf(marker) > -1;

    // --- Whole file formats ---

    // mol / sdf: both carry the V2000 tag; sdf additionally has the $$$$ record terminator.
    if (contains('V2000')) {
        return contains('$$$$') ? 'sdf' : 'mol';
    }

    // mol2
    if (contains('@<TRIPOS>')) return 'mol2';

    // pdb / cif
    // This is pretty silly; PDB detection should at least look for 'ATOM  '
    if (contains('ATOM') || contains('HETATM')) {
        const looksLikeCif = /^(_entry\.id|loop_|_entity)/m.test(data);
        return looksLikeCif ? 'cif' : 'pdb';
    }

    // xyz / cml / cdxml, tried in this order
    const wholeFileDetectors = [
        ['xyz', isXYZ],
        ['cml', isCML],
        ['cdxml', isCDXML],
    ];
    for (const [format, detect] of wholeFileDetectors) {
        if (detect(data)) return format;
    }

    // --- Line formats ---

    // inchi is tested before smi
    const lines = extractLines(data);
    if (isInchiGroup(lines)) return 'inchi';
    return isSmilesGroup(lines) ? 'smi' : '';
}

/**
 * Split text into its non-empty lines.
 * Every carriage return is removed first, so CRLF input yields the same lines as LF input
 * and a CRLF blank line does not survive as a stray "\r" entry.
 * @param {string} input text data
 * @returns {string[]} the lines of `input`, without empty lines
 */
function extractLines(input) {
    // Reminder: don't ever replace \r on binary data!
    // The g flag matters: without it only the first \r in the whole text is removed.
    return input
        .replace(/\r/g, '')
        .split('\n')
        .filter((line) => line);
}

/**
 * Test whether a line starts with something that looks like a SMILES string.
 * After trimming, the line must begin with a run of characters from a fixed SMILES-like
 * set, followed by whitespace or the end of the line.
 * @param {string} input one line of text
 * @returns {RegExpMatchArray|null} the match (group 1 is the SMILES-like run) or null
 */
function isSmilesLine(input) {
    const candidate = input.trim();
    // regex attempt for SMILES
    return candidate.match(/^([COHNSPFBIclohnspfbraeui\d()[\]@\-=#+\\/]+)(\s|$)/);
}

/**
 * Test whether a line, once trimmed, starts with the `InChI=` prefix.
 * @param {string} input one line of text
 * @returns {RegExpMatchArray|null} the match or null
 */
function isInchiLine(input) {
    const candidate = input.trim();
    return candidate.match(/^InChI=/);
}

/**
 * Test whether every line of the input looks like a SMILES line.
 * @param {string|string[]} input text (split into lines first) or an array of lines
 * @returns {boolean} false when there are no lines; otherwise whether all lines pass isSmilesLine
 */
function isSmilesGroup(input) {
    const candidates = (typeof input === 'string') ? extractLines(input) : input;
    return candidates.length > 0
        && candidates.every((candidate) => isSmilesLine(candidate));
}

/**
 * Test whether every line of the input looks like an InChI line.
 * @param {string|string[]} input text (split into lines first) or an array of lines
 * @returns {boolean} false when there are no lines; otherwise whether all lines pass isInchiLine
 */
function isInchiGroup(input) {
    const candidates = (typeof input === 'string') ? extractLines(input) : input;
    return candidates.length > 0
        && candidates.every((candidate) => isInchiLine(candidate));
}

/**
 * Test whether text looks like an xyz file.
 * Layout expected: <atom count>\n<comment>\n<atom lines ...>
 * The first line must convert to a number with Number(), the second line is ignored, and
 * there must be at least one further line; every further line must be an atom line:
 *     <whitespace?><element><spaces><number><spaces><number><spaces><number><whitespace?>
 * The number pattern is deliberately generous and accepts any of: XXX | XXX.YYY | .YYY
 * (optionally preceded by a minus sign). The count is not compared with the number of atom lines.
 * @param {string} input text data
 * @returns {boolean}
 */
function isXYZ(input) {
    const xyzAtomLineRE = /^(\s*)([A-Za-z]+)(\s+)(-?((\d+)|(\d+\.\d+)|(\.\d+)))(\s+)(-?((\d+)|(\d+\.\d+)|(\.\d+)))(\s+)(-?((\d+)|(\d+\.\d+)|(\.\d+)))(\s*)$/;

    const allLines = input.trim().replace(/\r/, '').split('\n');
    const atomCount = Number(allLines[0]);
    // Index 1 is the comment line, which is not inspected.
    const atomLines = allLines.slice(2);

    if (atomLines.length === 0 || Number.isNaN(atomCount)) {
        return false;
    }

    for (const atomLine of atomLines) {
        if (!xyzAtomLineRE.test(atomLine)) {
            return false;
        }
    }
    return true;
}

/**
 * Test whether text contains a `<molecule` tag opening, taken as the sign of CML.
 * @param {string} input text data
 * @returns {RegExpMatchArray|null} the match or null
 */
function isCML(input) {
    return input.match(/<molecule/);
}

/**
 * Test whether text contains a `<fragment` tag opening, taken as the sign of CDXML.
 * @param {string} input text data
 * @returns {RegExpMatchArray|null} the match or null
 */
function isCDXML(input) {
    return input.match(/<fragment/);
}
