Skip to content

Commit

Permalink
Fix parsing header lines where values are containing square bracket l…
Browse files Browse the repository at this point in the history
…ists (#107)
  • Loading branch information
cmdcolin authored Dec 8, 2024
1 parent 5ce0ce5 commit 5950b41
Show file tree
Hide file tree
Showing 19 changed files with 1,989 additions and 926 deletions.
42 changes: 17 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,34 +96,26 @@ The `variant` object returned by `parseLine()` would be
DB: true,
XYZ: ['5'],
},
SAMPLES: () => ({
HG00096: {
GT: ['0|0'],
AP: ['0.000', '0.000'],
},
}),
GENOTYPES: () => ({
HG00096: '0|0',
}),
}
```

The `variant` object also has two methods, "`SAMPLES()`" and
"`GENOTYPES()`", that are not evaluated unless they are called.

This can save time if you only want the variant information and not the
sample-specific information, especially if your VCF has a lot of samples in it.

In the above case the `variant.SAMPLES()` object would look like

```typescript
{
HG00096: {
GT: ['0|0'],
AP: ['0.000', '0.000'],
},
}
```

while the `variant.GENOTYPES()` object would look like this (it only extracts the GT
as a raw string)
`variant.SAMPLES()` and `variant.GENOTYPES()` are functions because the parser
does not eagerly parse all the genotype data; it is parsed only when you call
one of them, which can save time, especially if your VCF has a lot of samples
in it.

```typescript
{
HG00096: '0|0'
}
```
The `variant.SAMPLES()` function parses out the FORMAT fields, while
`variant.GENOTYPES()` returns just the raw genotype strings, which can be faster
if that is the only information you are interested in.

The parser will try to convert the values in INFO and FORMAT to the proper types
using the header metadata. For example, if there is a header line like
Expand Down Expand Up @@ -316,7 +308,7 @@ Returns **any** An object, string, or number, depending on the filtering
Parse a VCF line into an object like
```json
```typescript
{
CHROM: 'contigA',
POS: 3000,
Expand Down
46 changes: 46 additions & 0 deletions eslint.config.mjs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import eslint from '@eslint/js'
import eslintPluginUnicorn from 'eslint-plugin-unicorn'
import tseslint from 'typescript-eslint'

export default tseslint.config(
Expand All @@ -17,6 +18,7 @@ export default tseslint.config(
...tseslint.configs.recommended,
...tseslint.configs.stylisticTypeChecked,
...tseslint.configs.strictTypeChecked,
eslintPluginUnicorn.configs['flat/recommended'],
{
rules: {
curly: 'error',
Expand All @@ -40,6 +42,50 @@ export default tseslint.config(
'@typescript-eslint/no-unsafe-return': 'off',
'@typescript-eslint/no-non-null-assertion': 'off',
'@typescript-eslint/restrict-template-expressions': 'off',

'unicorn/no-new-array': 'off',
'unicorn/no-empty-file': 'off',
'unicorn/prefer-type-error': 'off',
'unicorn/prefer-modern-math-apis': 'off',
'unicorn/prefer-node-protocol': 'off',
'unicorn/no-unreadable-array-destructuring': 'off',
'unicorn/no-abusive-eslint-disable': 'off',
'unicorn/no-array-callback-reference': 'off',
'unicorn/number-literal-case': 'off',
'unicorn/prefer-add-event-listener': 'off',
'unicorn/prefer-top-level-await': 'off',
'unicorn/consistent-function-scoping': 'off',
'unicorn/no-await-expression-member': 'off',
'unicorn/no-lonely-if': 'off',
'unicorn/consistent-destructuring': 'off',
'unicorn/prefer-module': 'off',
'unicorn/prefer-optional-catch-binding': 'off',
'unicorn/no-useless-undefined': 'off',
'unicorn/no-null': 'off',
'unicorn/no-nested-ternary': 'off',
'unicorn/filename-case': 'off',
'unicorn/catch-error-name': 'off',
'unicorn/prevent-abbreviations': 'off',
'unicorn/prefer-code-point': 'off',
'unicorn/numeric-separators-style': 'off',
'unicorn/no-array-for-each': 'off',
'unicorn/prefer-spread': 'off',
'unicorn/explicit-length-check': 'off',
'unicorn/prefer-regexp-test': 'off',
'unicorn/relative-url-style': 'off',
'unicorn/prefer-math-trunc': 'off',
'unicorn/prefer-query-selector': 'off',
'unicorn/no-negated-condition': 'off',
'unicorn/switch-case-braces': 'off',
'unicorn/prefer-switch': 'off',
'unicorn/better-regex': 'off',
'unicorn/no-for-loop': 'off',
'unicorn/escape-case': 'off',
'unicorn/prefer-number-properties': 'off',
'unicorn/no-process-exit': 'off',
'unicorn/prefer-at': 'off',
'unicorn/prefer-structured-clone': 'off',
'unicorn/prefer-string-replace-all': 'off',
},
},
)
3 changes: 1 addition & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,12 @@
"devDependencies": {
"@babel/core": "^7.20.5",
"@eslint/js": "^9.7.0",
"@types/node": "^22.10.1",
"@typescript-eslint/eslint-plugin": "^8.8.1",
"@typescript-eslint/parser": "^8.8.1",
"@vitest/coverage-v8": "^2.1.3",
"documentation": "^14.0.1",
"eslint": "^9.7.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.1.3",
"eslint-plugin-unicorn": "^56.0.0",
"prettier": "^3.2.4",
"rimraf": "^6.0.1",
Expand Down
8 changes: 3 additions & 5 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import VCFParser from './parse'

export interface Breakend {
Join: string
Replacement: string
Expand Down Expand Up @@ -42,7 +40,7 @@ export function parseBreakend(breakendString: string): Breakend | undefined {
return {
Join: 'right',
SingleBreakend: true,
Replacement: breakendString.slice(0, breakendString.length - 1),
Replacement: breakendString.slice(0, -1),
}
} else if (breakendString.startsWith('<')) {
const res = /<(.*)>(.*)/.exec(breakendString)
Expand Down Expand Up @@ -77,6 +75,6 @@ export function parseBreakend(breakendString: string): Breakend | undefined {
return undefined
}

export default VCFParser

export type { Variant } from './parse'

export { default } from './parse'
87 changes: 21 additions & 66 deletions src/parse.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { parseMetaString } from './parseMetaString'
import vcfReserved from './vcfReserved'

function decodeURIComponentNoThrow(uri: string) {
Expand Down Expand Up @@ -35,7 +36,7 @@ export default class VCFParser {
if (!header.length) {
throw new Error('empty header received')
}
const headerLines = header.split(/[\r\n]+/).filter(line => line)
const headerLines = header.split(/[\r\n]+/).filter(Boolean)
if (!headerLines.length) {
throw new Error('no non-empty header lines specified')
}
Expand Down Expand Up @@ -129,7 +130,7 @@ export default class VCFParser {
genotypes[sample] = rest[i++]!
}
} else {
const gtIndex = formatSplit.findIndex(f => f === 'GT')
const gtIndex = formatSplit.indexOf('GT')
if (gtIndex === 0) {
for (const sample of this.samples) {
const val = rest[i++]!
Expand Down Expand Up @@ -171,7 +172,15 @@ export default class VCFParser {
this.metadata[r] = {}
}
const [id, keyVals] = this.parseStructuredMetaVal(metaVal)
;(this.metadata[r] as Record<string, unknown>)[id] = keyVals
if (id) {
// if there is an ID field in the <> metadata
// e.g. ##INFO=<ID=AF_ESP,...>
;(this.metadata[r] as Record<string, unknown>)[id] = keyVals
} else {
// if there is not an ID field in the <> metadata
// e.g. ##ID=<Description="ClinVar Variation ID">
this.metadata[r] = keyVals
}
} else {
this.metadata[r] = metaVal
}
Expand All @@ -187,8 +196,8 @@ export default class VCFParser {
* and 2) an object with the other key-value pairs in the metadata
*/
private parseStructuredMetaVal(metaVal: string) {
const keyVals = this.parseKeyValue(metaVal.replace(/^<|>$/g, ''), ',')
const id = keyVals.ID as string
const keyVals = parseMetaString(metaVal)
const id = keyVals.ID!
delete keyVals.ID
if ('Number' in keyVals) {
if (!Number.isNaN(Number(keyVals.Number))) {
Expand Down Expand Up @@ -218,69 +227,10 @@ export default class VCFParser {
return filteredMetadata
}

/**
* Sometimes VCFs have key-value strings that allow the separator within the
* value if it's in quotes, like:
* 'ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129"'
*
* Parse this at a low level since we can't just split at "," (or whatever
* separator). Above line would be parsed to: {ID: 'DB', Number: '0', Type:
* 'Flag', Description: 'dbSNP membership, build 129'}
*
* The string is scanned one character at a time by a three-state machine
* (reading a key, reading an unquoted value, reading inside double quotes).
* Double-quote characters themselves are never copied into the value. A key
* that is not followed by "=" is stored with the value `undefined`.
*
* @param str - the key-value text to parse
* @param pairSeparator - character that separates one pair from the next
* (defaults to ';')
* @returns an object mapping each key to its string value, or to `undefined`
* for keys that had no "="
*/
private parseKeyValue(str: string, pairSeparator = ';') {
const data = {} as Record<string, unknown>
let currKey = ''
let currValue = ''

// states:
// 1: read key to = or pair sep
// 2: read value to sep or quote
// 3: read value to quote
let state = 1
for (const s of str) {
if (state === 1) {
// read key to = or pair sep
if (s === '=') {
state = 2
} else if (s !== pairSeparator) {
currKey += s
} else if (currValue === '') {
// a separator arrived before any '=': record the key without a value.
// currValue is reset on every 2 -> 1 transition and only grows in
// states 2 and 3, so it is always '' while in state 1
data[currKey] = undefined
currKey = ''
}
} else if (state === 2) {
// read value to pair sep or quote
if (s === pairSeparator) {
// unquoted separator ends the pair: store it and go back to reading a key
data[currKey] = currValue
currKey = ''
currValue = ''
state = 1
} else if (s === '"') {
// opening quote: switch to quoted mode, the quote itself is dropped
state = 3
} else {
currValue += s
}
} else if (state === 3) {
// read value to quote
if (s !== '"') {
// inside quotes every character, including the separator, is kept
currValue += s
} else {
// closing quote: back to unquoted value mode, the quote itself is dropped
state = 2
}
}
}
// flush whatever was pending when the input ran out
if (state === 2 || state === 3) {
data[currKey] = currValue
} else if (state === 1) {
// also runs when currKey is '' (empty input or input ending in a
// separator), which adds an '' key mapped to undefined
data[currKey] = undefined
}
return data
}

/**
* Parse a VCF line into an object like
*
* ```json
* ```typescript
* {
* CHROM: 'contigA',
* POS: 3000,
Expand Down Expand Up @@ -350,7 +300,12 @@ export default class VCFParser {
const info =
fields[7] === undefined || fields[7] === '.'
? {}
: this.parseKeyValue(fields[7])
: Object.fromEntries(
fields[7].split(';').map(r => {
const ret = r.split('=')
return [ret[0], ret[1]]
}),
)

for (const key of Object.keys(info)) {
const items = (info[key] as string | undefined)
Expand Down
60 changes: 60 additions & 0 deletions src/parseMetaString.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// constructed with the assistance of claude AI
//
// I first prompted it with a regex that splits a comma separated string with
// awareness of quotation from this stackoverflow question
// https://stackoverflow.com/a/18893443/2129219, and asked it to add support
// for square brackets
//
// the result was this function
/**
 * Split a comma-separated string into its top-level fields.
 *
 * A comma only acts as a separator when it is outside of double quotes and
 * outside of square brackets, so values such as `"a, b"` or `[a, b]` stay in
 * one piece. Quote and bracket characters are kept in the returned fields,
 * and every field is trimmed of surrounding whitespace.
 *
 * Square brackets that appear inside a quoted value are treated as ordinary
 * text; otherwise an unbalanced `[` inside a quoted description would leave
 * the bracket state switched on and prevent all following fields from being
 * split.
 *
 * @param str - the text to split, e.g. `ID=DB,Description="x, y"`
 * @returns the trimmed fields in input order; an empty remainder after the
 * last comma is not emitted
 */
function customSplit(str: string): string[] {
  const result: string[] = []
  let current = ''
  let inQuotes = false
  let inBrackets = false

  for (const char of str) {
    if (char === ',' && !inQuotes && !inBrackets) {
      // top-level separator: close the current field
      result.push(current.trim())
      current = ''
      continue
    }

    if (char === '"') {
      inQuotes = !inQuotes
    } else if (!inQuotes) {
      // only track brackets outside of quotes, see the note above
      if (char === '[') {
        inBrackets = true
      } else if (char === ']') {
        inBrackets = false
      }
    }
    current += char
  }

  if (current) {
    result.push(current.trim())
  }

  return result
}

/**
 * Parse the body of a structured VCF header value, e.g.
 * `<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">`,
 * into a key/value object.
 *
 * - one leading `<` and one trailing `>` are removed
 * - the remainder is split into fields with `customSplit`, so commas inside
 *   double quotes or square brackets do not separate fields
 * - each field is split at its first `=` only, so a value may itself contain
 *   `=` (e.g. `Description="AF=0.5 cutoff"`)
 * - a value of the form `[a, b]` becomes an array of trimmed strings
 * - a value of the form `"..."` has the surrounding quotes removed
 * - any other value has a single leading and/or trailing `"` removed
 * - a field without `=` maps its key to `undefined`
 *
 * @param metaString - the structured value, with or without the enclosing
 * angle brackets
 * @returns an object keyed by field name
 */
export function parseMetaString(metaString: string) {
  const inside = metaString.replace(/^<|>$/g, '')
  return Object.fromEntries(
    customSplit(inside).map(f => {
      // split at the first '=' only; everything after it belongs to the value
      const eq = f.indexOf('=')
      const [key, val] = (
        eq === -1 ? [f] : [f.slice(0, eq), f.slice(eq + 1)]
      ).map(s => s.trim())
      if (val && val.startsWith('[') && val.endsWith(']')) {
        return [
          key,
          val
            .slice(1, -1)
            .split(',')
            .map(s => s.trim()),
        ]
      } else if (val && val.startsWith('"') && val.endsWith('"')) {
        return [key, val.slice(1, -1)]
      } else {
        return [key, val?.replaceAll(/^"|"$/g, '')]
      }
    }),
  )
}
Loading

0 comments on commit 5950b41

Please sign in to comment.