Please could you add a command so that I can remove rare kmers from a merge file? I'v

I made a hacky python to remove any kmer which occurs in fewer than 80% of samp

Remove rare kmers from merge file about ska HOT 2 CLOSED

bewt85 commented on June 18, 2024

Remove rare kmers from merge file

from ska.

Comments (2)

bewt85 commented on June 18, 2024

I made a hacky python script to remove any kmer which occurs in fewer than 80% of samples. It ran in between 1m50 and 2m20 on merge files of size 579M and 469M respectivly (around 1200 Salmonella Enterica assemblies)

import math
import sys

INPUT = sys.argv[1]
OUTPUT = sys.argv[2]
THRESHOLD = 0.8

def parse_count(ascii_field):
    bits = ''.join(['{0:06b}'.format(ord(c)-33) for c in ascii_field])
    return bits.count('1')

kmer_counts = {}

with open(INPUT, 'r') as f_in, open(OUTPUT, 'w') as f_out:
    k = int(f_in.readline().strip())
    kmer_length = 1 + ((k * 2) // 3)
    samples = f_in.readline().strip().split(' ')
    nSamples = len(samples)
    threshold = nSamples * THRESHOLD
    ascii_length = math.ceil(nSamples / 6)
    n_kmers = 2
    
    def parse_line(line):
        ascii_ = line[:ascii_length]
        count = parse_count(line[:ascii_length])
        kmers = ((line[i], line[i+1:i+kmer_length]) for i in range(ascii_length, len(line), kmer_length))
        return ascii_, count, kmers
    
    for line in f_in:
        _, count, kmers = parse_line(line.rstrip())
        for _, kmer in kmers:
            kmer_counts[kmer] = kmer_counts.get(kmer, 0) + count
            n_kmers += 1
            if (n_kmers % 1000000) == 0:
                print(f"Parsed {n_kmers} kmers")
            
    kmers_to_keep = {kmer for kmer, count in kmer_counts.items() if count > threshold}
    print(f"Found {len(kmers_to_keep)} kmers to keep")
    f_in.seek(0)
    # HEADERS
    f_out.write(f_in.readline())
    f_out.write(f_in.readline())
    
    for line in f_in:
        ascii_, _, kmers = parse_line(line.rstrip())
        outline = "".join([ascii_] + [ _ for snp, kmer in kmers for _ in (snp, kmer) if kmer in kmers_to_keep] + ["\n"])
        if len(outline) > ascii_length + 1:
            f_out.write(outline)

from ska.