DNAのサンプルデータをDNAのデータベースと比較し、誰のDNAかを鑑定する課題。

 

DNA

from csv import reader, DictReader
from sys import argv, exit

def main():
    # Handle command line arguments
    if len(argv) != 3:
        print("Usage: python dna.py data.csv sequence.txt")
        exit(1)

    # Open csv file and convert to dict
    with open(argv[1], "r") as csvfile:
        reader = DictReader(csvfile)
        dict_list = list(reader)

    # Open sequences file and convert to list
    with open(argv[2], "r") as txtfile:
        sequence = txtfile.read()

    # For each STR, compute longest run of consecutive repeats in sequence
    max_counts = []

    # Loop through each of the STRs and check them in sequence
    for i in range(1, len(reader.fieldnames)):
        STR = reader.fieldnames[i]

        max_counts.append(0)

        # Loop through sequence to find STR
        for j in range(len(sequence)):
            STR_count = 0

            # If match found, start counting repeats
            if sequence[j:(j + len(STR))] == STR:
                k = 0
                while sequence[(j + k):(j + k + len(STR))] == STR:
                    STR_count += 1
                    k += len(STR)

                # If new maximum of repeats, update max_counts
                if STR_count > max_counts[i - 1]:
                    max_counts[i - 1] = STR_count

    # Compare against data
    for i in range(len(dict_list)):
        matches = 0
        for j in range(1, len(reader.fieldnames)):
            if int(max_counts[j - 1]) == int(dict_list[i][reader.fieldnames[j]]):
                matches += 1

            if matches == (len(reader.fieldnames) - 1):
                print(dict_list[i]['name'])
                exit(0)

    print("No match")

main()