Rosalind Textbook track 문제풀이
Phillip Compeau 와 Pavel Pevzner 가 쓴 책 “능동적 접근 방식의 생물정보학 알고리즘” 에서 제공되는 연습 문제 모음입니다.
Rosalind 는 프로젝트 오일러, 구글 코드 잼 에서 영감을 얻었습니다. 이 프로젝트의 이름은 DNA 이중나선을 발견하는 데 기여한 로잘린드 프랭클린 에서 따왔습니다. Rosalind 는 프로그래밍 실력을 키우고자 하는 생물학자와 분자생물학의 계산 문제를 접해본 적이 없는 프로그래머들에게 도움이 될 것입니다.
0.1 Compute the Number of Times a Pattern Appears in a Text
This is the first problem in a collection of "code challenges" to accompany Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
A k-mer is a string of length k. We define Count(Text, Pattern) as the number of times that a k-mer Pattern appears as a substring of Text.
For example, we note that \(Count(\text{CGATATATCCATAG}, \text{ATA})\) is equal to 3 (not 2), since we should account for overlapping occurrences of Pattern in Text.
Given: DNA strings Text and Pattern.
Return: Count(Text, Pattern).
0.2 Sample Dataset
GCGCG
GCG
0.3 Sample Output
2
0.4 Solution
from typing import Generator
def generate_substrings(text: str, size: int) -> Generator[str, None, None]:
    """Yield every substring of length ``size`` from ``text``, left to right.

    Overlapping windows are included. Nothing is yielded when ``size`` is
    larger than ``len(text)``.
    """
    for i in range(len(text) - size + 1):
        yield text[i:i + size]
def count_pattern_occurrences(text: str, pattern: str) -> int:
    """Count how many times ``pattern`` occurs in ``text``, overlaps included."""
    # Each True comparison counts as 1 when summed.
    return sum(
        pattern == substring
        for substring in generate_substrings(text, len(pattern))
    )
# Sample input
sample_input = """
GCGCG
GCG
"""

# Split input into text and pattern
text, pattern = sample_input.strip().split("\n")

# Print the count of pattern occurrences in text
print(count_pattern_occurrences(text, pattern))
1 Find the Most Frequent Words in a String
We say that Pattern is a most frequent k-mer in Text if it maximizes Count(Text, Pattern) among all k-mers. For example, "ACTAT" is a most frequent 5-mer in "ACAACTATGCATCACTATCGGGAACTATCCT", and "ATA" is a most frequent 3-mer of "CGATATATCCATAG".
Given: A DNA string Text and an integer k.
Return: All most frequent k-mers in Text (in any order).
1.1 Sample Dataset
ACGTTGCATGTCGCATGATGCATGAGAGCT
4
1.2 Sample Output
CATG GCAT
1.3 Solution
from typing import List, Dict, Tuple
from collections import defaultdict
def generate_substrings(text: str, size: int) -> List[str]:
    """Return all (overlapping) substrings of length ``size`` from ``text``."""
    return [text[i:i + size] for i in range(len(text) - size + 1)]
def count_kmers(text: str, k: int) -> Dict[str, int]:
    """Count occurrences of each k-mer in ``text``.

    Returns a ``defaultdict(int)`` mapping each observed k-mer to its count.
    """
    kmer_counts = defaultdict(int)
    for kmer in generate_substrings(text, k):
        kmer_counts[kmer] += 1
    return kmer_counts
def most_frequent_kmers(kmer_counts: Dict[str, int]) -> List[str]:
    """Return every k-mer whose count equals the maximum count.

    An empty mapping yields an empty list (``max()`` would otherwise raise).
    """
    if not kmer_counts:
        return []
    max_count = max(kmer_counts.values())
    return [kmer for kmer, count in kmer_counts.items() if count == max_count]
# Sample input
sample_input = """
ACGTTGCATGTCGCATGATGCATGAGAGCT
4
"""

# Split input into text and pattern size
text, k = sample_input.strip().split("\n")
k = int(k)

# Find and print the most frequent k-mers
most_frequent = most_frequent_kmers(count_kmers(text, k))
print(*most_frequent)
2 Find the Reverse Complement of a String
Find the reverse complement of a DNA string.
Given: A DNA string Pattern.
Return: \(\overline{Pattern}\), the reverse complement of Pattern.
2.1 Sample Dataset
AAAACCCGGT
2.2 Sample Output
ACCGGGTTTT
2.3 Solution
def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a DNA sequence.

    The sequence is reversed and A<->T, C<->G are swapped. Characters other
    than A, C, G, T pass through unchanged.
    """
    return seq[::-1].translate(str.maketrans("ACGT", "TGCA"))
# Sample input
sample_input = """
AAAACCCGGT
"""

# Process the input and print the reverse complement
sequence = sample_input.strip().split()[0]
print(reverse_complement(sequence))
3 Find All Occurrences of a Pattern in a String
Pattern Matching Problem, Find all occurrences of a pattern in a string.
Given: Strings Pattern and Genome.
Return: All starting positions in Genome where Pattern appears as a substring. Use 0-based indexing.
3.1 Sample Dataset
ATAT
GATATATGCATATACTT
3.2 Sample Output
1 3 9
3.3 Solution
from typing import List, Generator
def generate_substrings(text: str, size: int) -> List[str]:
    """Return all (overlapping) substrings of length ``size`` from ``text``."""
    return [text[i:i + size] for i in range(len(text) - size + 1)]
def find_pattern_indices(text: str, pattern: str) -> Generator[int, None, None]:
    """Yield the 0-based start index of every occurrence of ``pattern`` in ``text``."""
    for i, substring in enumerate(generate_substrings(text, len(pattern))):
        if substring == pattern:
            yield i
# Sample input
sample_input = """
ATAT
GATATATGCATATACTT
"""

# Split input into pattern and text
pattern, text = sample_input.strip().split("\n")

# Print indices where the pattern is found
print(*find_pattern_indices(text, pattern))
4 Find Patterns Forming Clumps in a String
Clump Finding Problem, Find patterns forming clumps in a string.
Given: A string Genome, and integers k, L, and t.
Return: All distinct k-mers forming (L, t)-clumps in Genome.
4.1 Sample Dataset
CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC
5 75 4
4.2 Sample Output
CGACA GAAGA AATGT
4.3 Solution
from collections import defaultdict
from typing import List, Dict
def generate_substrings(text: str, size: int) -> List[str]:
    """Return all (overlapping) substrings of length ``size`` from ``text``."""
    return [text[i:i + size] for i in range(len(text) - size + 1)]
def find_kmers(text: str, k: int) -> Dict[str, List[int]]:
    """Map each k-mer in ``text`` to the ascending list of its start positions.

    Returns a ``defaultdict(list)``.
    """
    kmer_positions = defaultdict(list)
    for i, substring in enumerate(generate_substrings(text, k)):
        kmer_positions[substring].append(i)
    return kmer_positions
def has_clump(positions: List[int], L: int, t: int, k: int) -> bool:
    """Check whether ``t`` occurrences of a k-mer fit inside a window of length ``L``.

    ``positions`` holds the start indices of one k-mer in ascending order.
    A run of ``t`` consecutive occurrences spans from the first start to the
    end of the last occurrence (last start + ``k``); it forms a clump when
    that span is at most ``L``.
    """
    for i in range(len(positions) - t + 1):
        if (positions[i + t - 1] + k - positions[i]) <= L:
            return True
    return False
# Sample input
sample_input = """
CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC
5 75 4
"""

# Split input into sequence and parameters
seq, params = sample_input.strip().split("\n")
k, L, t = map(int, params.split())

# Find kmers and print those forming clumps
kmers = find_kmers(seq, k)
clumps = [kmer for kmer in kmers if has_clump(kmers[kmer], L, t, k)]
print(*clumps)
5 Find a Position in a Genome Minimizing the Skew
Minimum Skew Problem, Find a position in a genome minimizing the skew.
Given: A DNA string Genome.
Return: All integer(s) \(i\) minimizing \(Skew(Prefix_i(Genome))\) over all values of \(i\) (from 0 to \(|Genome|\)).
5.1 Sample Dataset
CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG
5.2 Sample Output
53 97
5.3 Solution
from typing import Generator
def find_minima(seq: str) -> Generator[int, None, None]:
    """Find the positions with the minimum skew in a DNA sequence.

    ``skew[i]`` is (#G - #C) over the first ``i`` nucleotides, so the list
    has ``len(seq) + 1`` entries and position 0 always has skew 0.

    Returns a generator over every index where the skew is minimal.
    Raises ``KeyError`` for characters other than A, C, G, T.
    """
    skew = [0]
    delta = {"G": 1, "C": -1, "A": 0, "T": 0}

    for i, nucleotide in enumerate(seq):
        skew.append(skew[i] + delta[nucleotide])

    min_skew = min(skew)
    return (i for i, value in enumerate(skew) if value == min_skew)
# Sample input
sample_input = """
CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG
"""

# Process the input and print the positions with minimum skew
sequence = sample_input.strip()
print(*find_minima(sequence))
6 Compute the Hamming Distance Between Two Strings
Hamming Distance Problem, Compute the Hamming distance between two DNA strings.
Given: Two DNA strings.
Return: An integer value representing the Hamming distance.
6.1 Sample Dataset
GGGCCGTTGGT
GGACCGTTGAC
6.2 Sample Output
3
6.3 Solution
from itertools import zip_longest
from typing import Tuple
def calculate_hamming_distance(sequence1: str, sequence2: str) -> int:
    """Return the number of positions at which the two sequences differ.

    ``zip_longest`` pads the shorter sequence with ``None``, so every extra
    character of the longer sequence counts as one mismatch.
    """
    return sum(
        base1 != base2
        for base1, base2 in zip_longest(sequence1, sequence2, fillvalue=None)
    )
def parse_dna_sequences(input_string: str) -> Tuple[str, str]:
    """Split the stripped input on newlines and return the lines as a tuple.

    The input is expected to hold exactly two lines (one sequence each).
    """
    return tuple(input_string.strip().split("\n"))
# Sample input
sample_input = """
GGGCCGTTGGT
GGACCGTTGAC
"""

dna_sequence1, dna_sequence2 = parse_dna_sequences(sample_input)
hamming_distance = calculate_hamming_distance(dna_sequence1, dna_sequence2)
print(hamming_distance)
7 Find All Approximate Occurrences of a Pattern in a String
Approximate Pattern Matching Problem, Find all approximate occurrences of a pattern in a string.
Given: Strings Pattern and Text along with an integer d.
Return: All starting positions where Pattern appears as a substring of Text with at most d mismatches.
7.1 Sample Dataset
ATTCTGGA
CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC
3
7.2 Sample Output
6 7 26 27 78
7.3 Solution
from typing import Iterator, List
def generate_substrings(dna_sequence: str, substring_length: int) -> Iterator[str]:
    """Lazily yield every substring of ``substring_length`` from ``dna_sequence``."""
    return (
        dna_sequence[i:i + substring_length]
        for i in range(len(dna_sequence) - substring_length + 1)
    )
def calculate_hamming_distance(sequence1: str, sequence2: str) -> int:
    """Count mismatching positions between two sequences.

    Uses ``zip``, so comparison stops at the end of the shorter sequence.
    """
    return sum(base1 != base2 for base1, base2 in zip(sequence1, sequence2))
def find_approximate_matches(pattern: str, genome: str, max_mismatch: int) -> Iterator[int]:
    """Yield 0-based start positions where ``pattern`` matches ``genome``
    with at most ``max_mismatch`` mismatches."""
    pattern_length = len(pattern)
    return (
        position
        for position, substring in enumerate(generate_substrings(genome, pattern_length))
        if calculate_hamming_distance(substring, pattern) <= max_mismatch
    )
def parse_input(input_data: str) -> tuple[str, str, int]:
    """Parse three lines: pattern, genome, and the integer mismatch limit."""
    pattern, genome, max_mismatch_str = input_data.strip().split("\n")
    return pattern, genome, int(max_mismatch_str)
sample_input = """
ATTCTGGA
CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC
3
"""

pattern, genome, max_mismatch = parse_input(sample_input)
match_positions = list(find_approximate_matches(pattern, genome, max_mismatch))
print(*match_positions)
8 Find the Most Frequent Words with Mismatches in a String
Frequent Words with Mismatches Problem, Find the most frequent k-mers with mismatches in a string.
Given: A string Text as well as integers k and d.
Return: All most frequent k-mers with up to d mismatches in Text.
8.1 Sample Dataset
ACGTTGCATGTCGCATGATGCATGAGAGCT
4 1
8.2 Sample Output
ATGC ATGT GATG
8.3 Solution
from collections import defaultdict
from itertools import product
from typing import Dict, List, Iterator, Tuple
def calculate_hamming_distance(sequence1: str, sequence2: str) -> int:
    """Count mismatching positions; comparison stops at the shorter sequence."""
    return sum(base1 != base2 for base1, base2 in zip(sequence1, sequence2))
def generate_substrings(dna_sequence: str, substring_length: int) -> Iterator[str]:
    """Lazily yield every substring of ``substring_length`` from ``dna_sequence``."""
    return (
        dna_sequence[i:i + substring_length]
        for i in range(len(dna_sequence) - substring_length + 1)
    )
def count_kmers(dna_sequence: str, kmer_length: int) -> Dict[str, int]:
    """Count exact occurrences of each k-mer; returns a ``defaultdict(int)``."""
    kmer_counts = defaultdict(int)
    for kmer in generate_substrings(dna_sequence, kmer_length):
        kmer_counts[kmer] += 1
    return kmer_counts
def find_most_frequent(kmer_counts: Dict[str, int]) -> List[str]:
    """Return every k-mer whose count equals the maximum count.

    An empty mapping yields an empty list (``max()`` would otherwise raise).
    """
    if not kmer_counts:
        return []
    max_count = max(kmer_counts.values())
    return [kmer for kmer, count in kmer_counts.items() if count == max_count]
def generate_all_kmers(kmer_length: int) -> Iterator[str]:
    """Lazily yield all 4**kmer_length DNA strings in lexicographic (ACGT) order."""
    return ("".join(bases) for bases in product("ACGT", repeat=kmer_length))
def count_approximate_kmers(observed_kmers: Dict[str, int], max_mismatches: int, kmer_length: int) -> Iterator[Tuple[str, int]]:
    """Yield ``(kmer, count)`` for every possible k-mer with a non-zero
    approximate count.

    The approximate count of a candidate is the summed exact count of all
    observed k-mers within ``max_mismatches`` of it.
    """
    for potential_kmer in generate_all_kmers(kmer_length):
        count = sum(
            observed_kmers[observed_kmer]
            for observed_kmer in observed_kmers
            if calculate_hamming_distance(potential_kmer, observed_kmer) <= max_mismatches
        )
        if count > 0:
            yield (potential_kmer, count)
def parse_input(input_data: str) -> Tuple[str, int, int]:
    """Parse a DNA line followed by a line with two integers (k and d)."""
    dna_sequence, params = input_data.strip().split("\n")
    kmer_length, max_mismatches = map(int, params.split())
    return dna_sequence, kmer_length, max_mismatches
sample_input = """
ACGTTGCATGTCGCATGATGCATGAGAGCT
4 1
"""

dna_sequence, kmer_length, max_mismatches = parse_input(sample_input)
observed_kmers = count_kmers(dna_sequence, kmer_length)
approximate_kmer_counts = dict(count_approximate_kmers(observed_kmers, max_mismatches, kmer_length))
most_frequent_kmers = find_most_frequent(approximate_kmer_counts)
print(*most_frequent_kmers)
9 Find Frequent Words with Mismatches and Reverse Complements
Frequent Words with Mismatches and Reverse Complements Problem. Find the most frequent k-mers (with mismatches and reverse complements) in a DNA string.
Given: A DNA string Text as well as integers k and d.
Return: All k-mers Pattern maximizing the sum \(Count_d(Text, Pattern) + Count_d(Text, \overline{Pattern})\) over all possible k-mers.
9.1 Sample Dataset
ACGTTGCATGTCGCATGATGCATGAGAGCT
4 1
9.2 Sample Output
ATGT ACAT
9.3 Solution
from collections import defaultdict
from itertools import product
from typing import Dict, List, Iterator, Tuple
def reverse_complement(dna: str) -> str:
    """Return the reverse complement of ``dna`` (reverse, then swap A<->T, C<->G)."""
    return dna[::-1].translate(str.maketrans("ACGT", "TGCA"))
def hamming_distance(seq1: str, seq2: str) -> int:
    """Count mismatching positions; comparison stops at the shorter sequence."""
    return sum(base1 != base2 for base1, base2 in zip(seq1, seq2))
def generate_substrings(dna: str, length: int) -> Iterator[str]:
    """Lazily yield every substring of ``length`` characters from ``dna``."""
    return (dna[i:i + length] for i in range(len(dna) - length + 1))
def count_kmers(dna: str, kmer_length: int) -> Dict[str, int]:
    """Count exact occurrences of each k-mer; returns a ``defaultdict(int)``."""
    kmer_counts = defaultdict(int)
    for kmer in generate_substrings(dna, kmer_length):
        kmer_counts[kmer] += 1
    return kmer_counts
def find_most_frequent(kmer_counts: Dict[str, int]) -> List[str]:
    """Return every k-mer whose count equals the maximum count.

    An empty mapping yields an empty list (``max()`` would otherwise raise).
    """
    if not kmer_counts:
        return []
    max_count = max(kmer_counts.values())
    return [kmer for kmer, count in kmer_counts.items() if count == max_count]
def generate_all_kmers(kmer_length: int) -> Iterator[str]:
    """Lazily yield all 4**kmer_length DNA strings in lexicographic (ACGT) order."""
    return ("".join(bases) for bases in product("ACGT", repeat=kmer_length))
def count_approximate_kmers(kmer_counts: Dict[str, int], max_mismatches: int, kmer_length: int) -> Iterator[Tuple[str, int]]:
    """Yield ``(kmer, count)`` for every possible k-mer with a non-zero count.

    The count is the summed exact count of observed k-mers within
    ``max_mismatches`` of the candidate, plus the same sum for the
    candidate's reverse complement.
    """
    for potential_kmer in generate_all_kmers(kmer_length):
        # Computed once per candidate rather than once per observed k-mer.
        complement = reverse_complement(potential_kmer)
        count = sum(
            kmer_counts[observed_kmer]
            for observed_kmer in kmer_counts
            if hamming_distance(potential_kmer, observed_kmer) <= max_mismatches
        )
        count += sum(
            kmer_counts[observed_kmer]
            for observed_kmer in kmer_counts
            if hamming_distance(complement, observed_kmer) <= max_mismatches
        )
        if count > 0:
            yield (potential_kmer, count)
def parse_input(input_data: str) -> Tuple[str, int, int]:
    """Parse a DNA line followed by a line with two integers (k and d)."""
    dna_sequence, params = input_data.strip().split("\n")
    kmer_length, max_mismatches = map(int, params.split())
    return dna_sequence, kmer_length, max_mismatches
sample_input = """
ACGTTGCATGTCGCATGATGCATGAGAGCT
4 1
"""

dna_sequence, kmer_length, max_mismatches = parse_input(sample_input)
kmer_counts = count_kmers(dna_sequence, kmer_length)
approximate_kmer_counts = dict(count_approximate_kmers(kmer_counts, max_mismatches, kmer_length))
print(*find_most_frequent(approximate_kmer_counts))
10 Generate the Frequency Array of a String
Computing a Frequency Array, Generate the frequency array of a DNA string.
Given: A DNA string Text and an integer k.
Return: The frequency array of k-mers in Text.
10.1 Sample Dataset
ACGCGGCTCTGAAA
2
10.2 Sample Output
2 1 0 0 0 0 2 2 1 2 1 0 0 1 1 0
10.3 Solution
from typing import Iterator, List, Dict, Tuple
from itertools import product
def generate_substrings(text: str, size: int) -> Iterator[str]:
    """Lazily yield every substring of length ``size`` from ``text``."""
    return (text[i : i + size] for i in range(len(text) - size + 1))
def count_pattern_occurrences(text: str, pattern: str) -> int:
    """Count occurrences of ``pattern`` in ``text``, overlaps included."""
    return sum(
        pattern == substring
        for substring in generate_substrings(text, len(pattern))
    )
def calculate_hamming_distance(s1: str, s2: str) -> int:
    """Count mismatching positions; comparison stops at the shorter string."""
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))
def generate_kmers(k: int) -> Iterator[str]:
    """Lazily yield all 4**k DNA strings of length ``k`` in lexicographic (ACGT) order."""
    return ("".join(bases) for bases in product("ACGT", repeat=k))
def count_approximate_kmers(kmer_counts: Dict[str, int], max_mismatches: int, kmer_length: int) -> Iterator[Tuple[str, int]]:
    """Yield ``(kmer, count)`` for every possible k-mer with a non-zero
    approximate count (summed counts of observed k-mers within
    ``max_mismatches`` of it)."""
    for potential_kmer in generate_kmers(kmer_length):
        count = sum(
            kmer_counts[observed_kmer]
            for observed_kmer in kmer_counts
            if calculate_hamming_distance(potential_kmer, observed_kmer) <= max_mismatches
        )
        if count > 0:
            yield (potential_kmer, count)
def calculate_kmer_frequencies(sequence: str, kmer_length: int) -> List[int]:
    """Return the frequency array of ``sequence``.

    Entry ``i`` is the number of occurrences of the ``i``-th k-mer in
    lexicographic (ACGT) order, so the list has ``4**kmer_length`` entries.
    """
    return [
        count_pattern_occurrences(sequence, kmer)
        for kmer in generate_kmers(kmer_length)
    ]
def parse_input(input_data: str) -> Tuple[str, int]:
    """Parse a DNA line followed by a line holding the integer k."""
    sequence, kmer_length = input_data.strip().split("\n")
    return sequence, int(kmer_length)
sample_input = """
ACGCGGCTCTGAAA
2
"""

sequence, kmer_length = parse_input(sample_input)
kmer_frequencies = calculate_kmer_frequencies(sequence, kmer_length)
print(*kmer_frequencies)
11 Implement PatternToNumber
Implement PatternToNumber, Convert a DNA string to a number.
Given: A DNA string Pattern.
Return: PatternToNumber(Pattern).
11.1 Sample Dataset
AGT
11.2 Sample Output
11
11.3 Solution
from typing import Dict, Tuple
def create_nucleotide_to_number_map() -> Dict[str, int]:
    """Return the base-4 digit assigned to each nucleotide (A=0, C=1, G=2, T=3)."""
    return {"A": 0, "C": 1, "G": 2, "T": 3}
def convert_nucleotide_to_number(nucleotide: str, nucleotide_map: Dict[str, int]) -> int:
    """Look up the digit for ``nucleotide``; raises ``KeyError`` if it is unmapped."""
    return nucleotide_map[nucleotide]
def convert_dna_pattern_to_number(dna_pattern: str, nucleotide_map: Dict[str, int]) -> int:
    """Interpret ``dna_pattern`` as a base-4 number (PatternToNumber).

    The leftmost nucleotide is the most significant digit; the empty pattern
    maps to 0. Implemented as a loop so long patterns neither re-slice the
    string at every step nor hit the recursion limit.
    """
    number = 0
    for nucleotide in dna_pattern:
        number = 4 * number + convert_nucleotide_to_number(nucleotide, nucleotide_map)
    return number
def parse_input(input_data: str) -> str:
    """Return the input with surrounding whitespace removed."""
    return input_data.strip()
sample_input = """
AGT
"""

dna_pattern = parse_input(sample_input)
nucleotide_map = create_nucleotide_to_number_map()
result = convert_dna_pattern_to_number(dna_pattern, nucleotide_map)
print(result)
12 Implement NumberToPattern
Implement NumberToPattern, Convert an integer to its corresponding DNA string.
Given: Integers index and k.
Return: NumberToPattern(index, k).
12.1 Sample Dataset
45
4
12.2 Sample Output
AGTC
12.3 Solution
from typing import Tuple
def number_to_nucleotide(index: int) -> str:
    """Return the nucleotide for a base-4 digit (0=A, 1=C, 2=G, 3=T).

    Raises ``IndexError`` for an index of 4 or more.
    """
    nucleotides = ["A", "C", "G", "T"]
    return nucleotides[index]
def number_to_dna_pattern(index: int, length: int) -> str:
    """Convert ``index`` to its DNA string of ``length`` characters (NumberToPattern).

    The last nucleotide is the least significant base-4 digit.

    Raises:
        ValueError: if ``length`` is smaller than 1 (the recursion would
            otherwise never reach its base case).
    """
    if length < 1:
        raise ValueError("length must be a positive integer")
    if length == 1:
        return number_to_nucleotide(index)
    quotient, remainder = divmod(index, 4)
    return number_to_dna_pattern(quotient, length - 1) + number_to_nucleotide(remainder)
def parse_input(input_data: str) -> Tuple[int, int]:
    """Parse two lines holding the integers ``index`` and ``k``."""
    index_str, length_str = input_data.strip().split("\n")
    return int(index_str), int(length_str)
sample_input = """
45
4
"""

index, length = parse_input(sample_input)
dna_pattern = number_to_dna_pattern(index, length)
print(dna_pattern)
13 Generate the d-Neighborhood of a String
Generate the d-Neighborhood of a String: find all the neighbors of a pattern.
Given: A DNA string Pattern and an integer d.
Return: The collection of strings Neighbors(Pattern, d).
13.1 Sample Dataset
ACG
1
13.2 Sample Output
CCG
TCG
GCG
AAG
ATG
AGG
ACA
ACC
ACT
ACG
13.3 Solution
from typing import Set, List, Tuple, Iterator
def calculate_hamming_distance(seq1: str, seq2: str) -> int:
    """Count mismatching positions; comparison stops at the shorter sequence."""
    return sum(base1 != base2 for base1, base2 in zip(seq1, seq2))
def generate_immediate_neighbors(sequence: str) -> Iterator[str]:
    """Yield every string that differs from ``sequence`` in exactly one position."""
    nucleotides = ["A", "T", "G", "C"]
    for i, current_base in enumerate(sequence):
        for new_base in nucleotides:
            if new_base != current_base:
                yield sequence[:i] + new_base + sequence[i + 1:]
def generate_neighbors(sequence: str, max_distance: int) -> Set[str]:
    """Return the d-neighborhood of ``sequence``: all strings of the same
    length within Hamming distance ``max_distance`` (the sequence included).

    Built recursively from the neighborhood of the suffix. A suffix
    neighbor that still has distance budget left may take any first base;
    one that has used the whole budget must keep the original first base.
    """
    nucleotides = ["A", "T", "G", "C"]
    # The empty string is its own only neighbor; without this guard the
    # recursion on sequence[1:] would never terminate for empty input.
    if max_distance == 0 or not sequence:
        return {sequence}
    if len(sequence) == 1:
        return set(nucleotides)

    neighbors = set()
    suffix_neighbors = generate_neighbors(sequence[1:], max_distance)
    for suffix in suffix_neighbors:
        if calculate_hamming_distance(sequence[1:], suffix) < max_distance:
            neighbors.update(base + suffix for base in nucleotides)
        else:
            neighbors.add(sequence[0] + suffix)
    return neighbors
def parse_input(input_data: str) -> Tuple[str, int]:
    """Parse a pattern line followed by a line holding the integer d."""
    sequence, distance = input_data.strip().split("\n")
    return sequence, int(distance)
sample_input = """
ACG
1
"""

sequence, max_distance = parse_input(sample_input)
neighbor_sequences = generate_neighbors(sequence, max_distance)
print(*sorted(neighbor_sequences), sep="\n")
14 Implement MotifEnumeration
Implanted Motif Problem. Implement MotifEnumeration (shown above) to find all (k, d)-motifs in a collection of strings.
Given: Integers k and d, followed by a collection of strings Dna.
Return: All (k, d)-motifs in Dna.
14.1 Sample Dataset
3 1
ATTTGGC
TGCCTTA
CGGTATC
GAAAATT
14.2 Sample Output
ATA ATT GTT TTT
14.3 Solution
from typing import List, Set, Iterator
def calculate_hamming_distance(sequence1: str, sequence2: str) -> int:
    """Count mismatching positions; comparison stops at the shorter sequence."""
    return sum(c1 != c2 for c1, c2 in zip(sequence1, sequence2))
def generate_neighbors(sequence: str, max_distance: int) -> Set[str]:
    """Return all strings of the same length as ``sequence`` within Hamming
    distance ``max_distance`` of it (the sequence itself included).

    A suffix neighbor with distance budget left may take any first base;
    one that has used the whole budget keeps the original first base.
    """
    nucleotides = ["A", "T", "G", "C"]
    # Guard the empty string: recursing on sequence[1:] would never stop.
    if max_distance == 0 or not sequence:
        return {sequence}
    if len(sequence) == 1:
        return set(nucleotides)

    neighbor_set = set()
    for neighbor in generate_neighbors(sequence[1:], max_distance):
        if calculate_hamming_distance(sequence[1:], neighbor) < max_distance:
            for nucleotide in nucleotides:
                neighbor_set.add(nucleotide + neighbor)
        else:
            neighbor_set.add(sequence[0] + neighbor)
    return neighbor_set
def generate_substrings(text: str, substring_length: int) -> Iterator[str]:
    """Yield every substring of ``substring_length`` from ``text``, left to right."""
    for i in range(len(text) - substring_length + 1):
        yield text[i : i + substring_length]
def get_all_kmers(dna_sequences: List[str], kmer_length: int) -> Set[str]:
    """Return the set of distinct k-mers occurring in any of the sequences."""
    return {
        kmer
        for sequence in dna_sequences
        for kmer in generate_substrings(sequence, kmer_length)
    }
def contains_approximate_match(pattern: str, text: str, max_distance: int) -> bool:
    """Return True if some substring of ``text`` is within ``max_distance``
    mismatches of ``pattern``."""
    return any(
        calculate_hamming_distance(substring, pattern) <= max_distance
        for substring in generate_substrings(text, len(pattern))
    )
def enumerate_motifs(dna_sequences: List[str], kmer_length: int, max_distance: int) -> Set[str]:
    """Find all (k, d)-motifs (MotifEnumeration).

    A candidate is any neighbor, within ``max_distance``, of a k-mer that
    occurs in the input. It is kept when every sequence contains an
    approximate match of it.
    """
    motif_patterns = set()
    for kmer in get_all_kmers(dna_sequences, kmer_length):
        for neighbor_kmer in generate_neighbors(kmer, max_distance):
            if all(
                contains_approximate_match(neighbor_kmer, sequence, max_distance)
                for sequence in dna_sequences
            ):
                motif_patterns.add(neighbor_kmer)
    return motif_patterns
# Sample input
sample_input = """
3 1
ATTTGGC
TGCCTTA
CGGTATC
GAAAATT
"""

# First line holds k and d; the remaining lines are the DNA strings
input_params, *dna_sequences = sample_input.strip().split("\n")
kmer_length, max_distance = map(int, input_params.split())
print(*sorted(enumerate_motifs(dna_sequences, kmer_length, max_distance)))
15 Find a Median String
Median String Problem, Find a median string.
Given: An integer k and a collection of strings Dna.
Return: A k-mer Pattern that minimizes d(Pattern, Dna) over all k-mers Pattern. (If multiple answers exist, you may return any one.)
15.1 Sample Dataset
3
AAATTGACGCAT
GACGACCACGTT
CGTCAGCGCCTG
GCTGAGCACCGG
AGTACGGGACAG
15.2 Sample Output
ACG
15.3 Solution
from typing import Iterator, List
from itertools import product
import math
def generate_substrings(text: str, substring_length: int) -> Iterator[str]:
    """Yield every substring of ``substring_length`` from ``text``, left to right."""
    for i in range(len(text) - substring_length + 1):
        yield text[i : i + substring_length]
def generate_kmers(kmer_length: int) -> Iterator[str]:
    """Lazily yield all 4**kmer_length DNA strings in lexicographic (ACGT) order."""
    return ("".join(nucleotides) for nucleotides in product("ACGT", repeat=kmer_length))
def calculate_hamming_distance(sequence1: str, sequence2: str) -> int:
    """Count mismatching positions; comparison stops at the shorter sequence."""
    return sum(
        nucleotide1 != nucleotide2
        for nucleotide1, nucleotide2 in zip(sequence1, sequence2)
    )
def find_minimum_distance(pattern: str, text: str) -> int:
    """Return the smallest Hamming distance between ``pattern`` and any
    equally long substring of ``text``.

    Raises ``ValueError`` (from ``min``) when ``text`` is shorter than
    ``pattern``, because there is then no substring to compare.
    """
    return min(
        calculate_hamming_distance(substring, pattern)
        for substring in generate_substrings(text, len(pattern))
    )
def calculate_total_distance(pattern: str, dna_sequences: List[str]) -> int:
    """Return d(Pattern, Dna): the sum of the minimum distances between
    ``pattern`` and each sequence."""
    return sum(find_minimum_distance(pattern, sequence) for sequence in dna_sequences)
def find_median_string(dna_sequences: List[str], kmer_length: int) -> str:
    """Return a k-mer minimizing the total distance to ``dna_sequences``.

    All 4**kmer_length candidates are tried in lexicographic order. The
    strict ``<`` keeps the first candidate that reaches the minimum.
    """
    min_distance = math.inf
    median_kmer = ""

    for kmer in generate_kmers(kmer_length):
        current_distance = calculate_total_distance(kmer, dna_sequences)
        if current_distance < min_distance:
            min_distance = current_distance
            median_kmer = kmer

    return median_kmer
# Sample input
sample_input = """
3
AAATTGACGCAT
GACGACCACGTT
CGTCAGCGCCTG
GCTGAGCACCGG
AGTACGGGACAG
"""

# First line holds k; the remaining lines are the DNA strings
kmer_length, *dna_sequences = sample_input.strip().split("\n")
result = find_median_string(dna_sequences, int(kmer_length))
print(result)
16 Find a Profile-most Probable k-mer in a String
Profile-most Probable k-mer Problem, Find a Profile-most probable k-mer in a string.
Given: A string Text, an integer k, and a 4 × k matrix Profile.
Return: A Profile-most probable k-mer in Text. (If multiple answers exist, you may return any one.)
16.1 Sample Dataset
ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT
5
0.2 0.2 0.3 0.2 0.3
0.4 0.3 0.1 0.5 0.1
0.3 0.3 0.5 0.2 0.4
0.1 0.2 0.1 0.1 0.2
16.2 Sample Output
CCGAG
16.3 Solution
from typing import Iterator, List
import math
def generate_substrings(text: str, substring_length: int) -> Iterator[str]:
    """Yield every substring of ``substring_length`` from ``text``, left to right."""
    for i in range(len(text) - substring_length + 1):
        yield text[i : i + substring_length]
def find_profile_most_probable_kmer(sequence: str, kmer_length: int, profile_matrix: List[List[float]]) -> str:
    """Return the k-mer of ``sequence`` with the highest probability under
    ``profile_matrix``.

    ``profile_matrix`` has four rows in A, C, G, T order, one column per
    k-mer position. Ties go to the earliest k-mer (strict ``>``). Starting
    from -1 means a k-mer with probability 0 can still be selected.
    """
    nucleotide_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    max_probability = -1
    most_probable_kmer = ""

    for kmer in generate_substrings(sequence, kmer_length):
        kmer_probability = math.prod(
            profile_matrix[nucleotide_index[kmer[j]]][j] for j in range(kmer_length)
        )
        if kmer_probability > max_probability:
            max_probability = kmer_probability
            most_probable_kmer = kmer

    return most_probable_kmer
# Sample input
sample_input = """
ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT
5
0.2 0.2 0.3 0.2 0.3
0.4 0.3 0.1 0.5 0.1
0.3 0.3 0.5 0.2 0.4
0.1 0.2 0.1 0.1 0.2
"""

# Line 1: text, line 2: k, remaining lines: profile rows (A, C, G, T)
dna_sequence, kmer_length, *profile_rows = sample_input.strip().split("\n")
profile_matrix = [list(map(float, row.split())) for row in profile_rows]
result = find_profile_most_probable_kmer(dna_sequence, int(kmer_length), profile_matrix)
print(result)
17 Implement GreedyMotifSearch
Implement GreedyMotifSearch.
Given: Integers k and t, followed by a collection of strings Dna.
Return: A collection of strings BestMotifs resulting from running GreedyMotifSearch(Dna, k, t). If at any step you find more than one Profile-most probable k-mer in a given string, use the one occurring first.
17.1 Sample Dataset
3 5
GGCGTTCAGGCA
AAGAATCAGTCA
CAAGGAGTTCGC
CACGTCAATCAC
CAATAATATTCG
17.2 Sample Output
CAG
CAG
CAA
CAA
CAA
17.3 Solution
from typing import Iterator, List, Dict
from collections import Counter
import math
def generate_kmers(sequence: str, kmer_length: int) -> Iterator[str]:
    """Yield every k-mer of ``sequence`` from left to right."""
    for i in range(len(sequence) - kmer_length + 1):
        yield sequence[i : i + kmer_length]
def find_most_probable_kmer(sequence: str, kmer_length: int, profile: List[List[float]]) -> str:
    """Return the profile-most probable k-mer of ``sequence``.

    ``profile`` has four rows in A, C, G, T order. Ties go to the earliest
    k-mer (strict ``>``); starting from -1 lets a zero-probability k-mer win
    when nothing scores higher.
    """
    nucleotide_to_index: Dict[str, int] = {"A": 0, "C": 1, "G": 2, "T": 3}
    max_probability: float = -1
    most_probable_kmer: str = ""

    for kmer in generate_kmers(sequence, kmer_length):
        kmer_probability: float = math.prod(
            profile[nucleotide_to_index[kmer[j]]][j] for j in range(kmer_length)
        )
        if kmer_probability > max_probability:
            max_probability = kmer_probability
            most_probable_kmer = kmer

    return most_probable_kmer
def create_profile(sequences: List[str], pseudocount: int = 0) -> List[List[float]]:
    """Build a 4 x k profile (rows A, C, G, T) from equally long motifs.

    Each entry is ``(count + pseudocount) / len(sequences)``. With a
    non-zero pseudocount the columns therefore do not sum to 1, because the
    denominator is not adjusted.
    """
    nucleotides: List[str] = ["A", "C", "G", "T"]
    profile: List[List[float]] = [[] for _ in nucleotides]
    for i, nucleotide in enumerate(nucleotides):
        profile[i] = [
            (sum(seq[j] == nucleotide for seq in sequences) + pseudocount) / len(sequences)
            for j in range(len(sequences[0]))
        ]
    return profile
def calculate_score(motifs: List[str]) -> int:
    """Return the number of nucleotides that differ from the most common
    nucleotide of their column, summed over all columns."""
    score: int = 0
    for i in range(len(motifs[0])):
        column: List[str] = [motif[i] for motif in motifs]
        # most_common(1) is enough: only the top entry is used.
        most_common: str = Counter(column).most_common(1)[0][0]
        score += sum(nucleotide != most_common for nucleotide in column)
    return score
def greedy_motif_search(dna_sequences: List[str], kmer_length: int, pseudocount: int = 0) -> List[str]:
    """Run GreedyMotifSearch and return the best motif collection found.

    Each k-mer of the first sequence seeds a motif list. Every following
    sequence contributes its profile-most probable k-mer under the profile
    of the motifs chosen so far. The lowest-scoring list wins; the initial
    best is the first k-mer of every sequence.
    """
    best_motifs: List[str] = [seq[:kmer_length] for seq in dna_sequences]
    for kmer in generate_kmers(dna_sequences[0], kmer_length):
        current_motifs: List[str] = [kmer]
        for i in range(1, len(dna_sequences)):
            current_profile: List[List[float]] = create_profile(current_motifs, pseudocount=pseudocount)
            current_motifs.append(find_most_probable_kmer(dna_sequences[i], kmer_length, current_profile))
        if calculate_score(current_motifs) < calculate_score(best_motifs):
            best_motifs = current_motifs
    return best_motifs
# Sample input
sample_input: str = """
3 5
GGCGTTCAGGCA
AAGAATCAGTCA
CAAGGAGTTCGC
CACGTCAATCAC
CAATAATATTCG
"""

# First line holds k and t; the remaining lines are the DNA strings
ints, *dna = sample_input.strip().split("\n")
k, t = map(int, ints.split())
result: List[str] = greedy_motif_search(dna, k)
print(*result, sep="\n")
18 Implement GreedyMotifSearch with Pseudocounts
Implement GreedyMotifSearch with Pseudocounts.
Given: Integers k and t, followed by a collection of strings Dna.
Return: A collection of strings BestMotifs resulting from running GreedyMotifSearch(Dna, k, t) with pseudocounts. If at any step you find more than one Profile-most probable k-mer in a given string, use the one occurring first.
18.1 Sample Dataset
3 5
GGCGTTCAGGCA
AAGAATCAGTCA
CAAGGAGTTCGC
CACGTCAATCAC
CAATAATATTCG
18.2 Sample Output
TTC
ATC
TTC
ATC
TTC
18.3 Solution
from typing import Iterator, List, Dict
from collections import Counter
import math
def generate_kmers(sequence: str, kmer_length: int) -> Iterator[str]:
    """Yield every k-mer of ``sequence`` from left to right."""
    for i in range(len(sequence) - kmer_length + 1):
        yield sequence[i : i + kmer_length]
def find_most_probable_kmer(sequence: str, kmer_length: int, profile: List[List[float]]) -> str:
    """Return the profile-most probable k-mer of ``sequence``.

    ``profile`` has four rows in A, C, G, T order. ``max`` returns the first
    maximal element, so ties go to the earliest k-mer.
    """
    nucleotide_to_index: Dict[str, int] = {"A": 0, "C": 1, "G": 2, "T": 3}
    return max(
        generate_kmers(sequence, kmer_length),
        key=lambda kmer: math.prod(
            profile[nucleotide_to_index[nucleotide]][position]
            for position, nucleotide in enumerate(kmer)
        ),
    )
def create_profile(motifs: List[str], pseudocount: int = 0) -> List[List[float]]:
    """Build a 4 x k profile (rows A, C, G, T) from equally long motifs.

    Each entry is ``(count + pseudocount) / len(motifs)``. The denominator
    is not adjusted for the pseudocount, so every entry is scaled by the
    same constant and columns do not sum to 1 when ``pseudocount`` is
    non-zero.
    """
    nucleotides: List[str] = ["A", "C", "G", "T"]
    motif_count: int = len(motifs)
    motif_length: int = len(motifs[0])

    return [
        [
            (sum(motif[position] == nucleotide for motif in motifs) + pseudocount) / motif_count
            for position in range(motif_length)
        ]
        for nucleotide in nucleotides
    ]
def calculate_score(motifs: List[str]) -> int:
    """Return the number of nucleotides that differ from the most common
    nucleotide of their column, summed over all columns."""
    score = 0
    for column in zip(*motifs):
        # Count the column once; mismatches are everything except the
        # occurrences of the most common nucleotide.
        most_common_count = Counter(column).most_common(1)[0][1]
        score += len(column) - most_common_count
    return score
def greedy_motif_search(dna_sequences: List[str], kmer_length: int, pseudocount: int = 0) -> List[str]:
    """Run GreedyMotifSearch (optionally with pseudocounts).

    Each k-mer of the first sequence seeds a motif list. Every following
    sequence contributes its profile-most probable k-mer under the profile
    of the motifs chosen so far. The lowest-scoring list wins; the initial
    best is the first k-mer of every sequence.
    """
    best_motifs: List[str] = [sequence[:kmer_length] for sequence in dna_sequences]

    for kmer in generate_kmers(dna_sequences[0], kmer_length):
        current_motifs: List[str] = [kmer]
        for sequence in dna_sequences[1:]:
            profile: List[List[float]] = create_profile(current_motifs, pseudocount)
            current_motifs.append(find_most_probable_kmer(sequence, kmer_length, profile))

        if calculate_score(current_motifs) < calculate_score(best_motifs):
            best_motifs = current_motifs

    return best_motifs
# Sample input
sample_input: str = """
3 5
GGCGTTCAGGCA
AAGAATCAGTCA
CAAGGAGTTCGC
CACGTCAATCAC
CAATAATATTCG
"""

# Whitespace split: k, t (unused), then the DNA strings
k_value, _, *dna_sequences = sample_input.strip().split()
k_value = int(k_value)
result: List[str] = greedy_motif_search(dna_sequences, k_value, pseudocount=1)
print(*result, sep="\n")
19 Implement RandomizedMotifSearch
Implement RandomizedMotifSearch.
Given: Positive integers k and t, followed by a collection of strings Dna.
Return: A collection BestMotifs resulting from running RandomizedMotifSearch(Dna, k, t) 1000 times. Remember to use pseudocounts!
19.1 Sample Dataset
8 5
CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA
19.2 Sample Output
AACGGCCA
AAGTGCCA
TAGTACCG
AAGTTTCA
ACGTGCAA
19.3 Solution
from typing import List, Tuple, Callable
from collections import Counter
from random import randint
import math
def generate_kmers(sequence: str, kmer_length: int) -> List[str]:
    """Return every k-mer of ``sequence`` from left to right."""
    return [sequence[i:i+kmer_length] for i in range(len(sequence) - kmer_length + 1)]
def find_most_probable_kmer(sequence: str, kmer_length: int, profile: List[List[float]]) -> str:
    """Return the profile-most probable k-mer of ``sequence``.

    ``profile`` has four rows in A, C, G, T order. ``max`` returns the first
    maximal element, so ties go to the earliest k-mer.
    """
    nucleotide_to_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    return max(
        generate_kmers(sequence, kmer_length),
        key=lambda kmer: math.prod(
            profile[nucleotide_to_index[nucleotide]][j]
            for j, nucleotide in enumerate(kmer)
        ),
    )
def create_profile(motifs: List[str], pseudocount: int = 0) -> List[List[float]]:
    """Build a 4-row profile matrix (rows A, C, G, T) from equal-length motifs.

    Each entry is ``(count + pseudocount) / (len(motifs) + 4 * pseudocount)``
    for that nucleotide at that position.
    """
    denominator = len(motifs) + 4 * pseudocount
    width = len(motifs[0])
    return [
        [(sum(seq[j] == nucleotide for seq in motifs) + pseudocount) / denominator for j in range(width)]
        for nucleotide in ("A", "C", "G", "T")
    ]
def calculate_score(motifs: List[str]) -> int:
    """Return the number of symbols that differ from the column consensus, summed over columns."""
    score = 0
    for column in zip(*motifs):
        # One Counter per column; mismatches = column height minus consensus count.
        consensus_count = Counter(column).most_common(1)[0][1]
        score += len(column) - consensus_count
    return score
def generate_random_kmer(sequence: str, kmer_length: int) -> str:
    """Return a k-mer of ``sequence`` starting at a uniformly random valid position."""
    start = randint(0, len(sequence) - kmer_length)
    return sequence[start:start + kmer_length]
def find_motifs(profile: List[List[float]], dna_sequences: List[str]) -> List[str]:
    """Return the profile-most-probable k-mer of each sequence; k is the profile's width."""
    kmer_length = len(profile[0])
    return [find_most_probable_kmer(seq, kmer_length, profile) for seq in dna_sequences]
def randomized_motif_search(dna_sequences: List[str], kmer_length: int) -> Tuple[int, List[str]]:
    """Run one randomized motif search and return ``(score, motifs)``.

    Starting from random k-mers, motifs are repeatedly replaced by the
    profile-most-probable k-mers (profile built with pseudocount 1) until
    the score stops improving. The returned motifs are the ones that
    achieved the returned score.
    """
    motifs = [generate_random_kmer(seq, kmer_length) for seq in dna_sequences]
    best_motifs = motifs
    best_score = math.inf

    while True:
        profile = create_profile(motifs, pseudocount=1)
        motifs = find_motifs(profile, dna_sequences)
        current_score = calculate_score(motifs)

        if current_score >= best_score:
            # Return the motifs that produced best_score, not the latest
            # (equal or worse) candidate.
            return best_score, best_motifs
        best_score, best_motifs = current_score, motifs
def find_best_motifs(search_function: Callable, iterations: int, *args) -> List[str]:
    """Call ``search_function(*args)`` repeatedly and keep the lowest-scoring motifs.

    ``search_function`` must return ``(score, motifs)``. It is called once
    up front and then ``iterations - 1`` more times; ties keep the earlier
    result.
    """
    best_score, best_motifs = search_function(*args)

    for _ in range(iterations - 1):
        score, motifs = search_function(*args)
        if score < best_score:
            best_score, best_motifs = score, motifs

    return best_motifs
# Sample input
sample_input = """
8 5
CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA
"""

# First token is k, second is t (unused), the rest are the DNA strings
kmer_length, _, *dna_sequences = sample_input.strip().split()
kmer_length = int(kmer_length)

result = find_best_motifs(randomized_motif_search, 1000, dna_sequences, kmer_length)
print(*result, sep="\n")
20 Implement GibbsSampler
Implement GibbsSampler.
Given: Integers k, t, and N, followed by a collection of strings Dna.
Return: The strings BestMotifs resulting from running GibbsSampler(Dna, k, t, N) with 20 random starts. Remember to use pseudocounts!
20.1 Sample Dataset
8 5 100
CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA
20.2 Sample Output
TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG
20.3 Solution
from typing import List, Iterator, Tuple, Callable
from collections import Counter
import math
import random
def generate_kmers(sequence: str, kmer_length: int) -> Iterator[str]:
    """Yield every substring of length ``kmer_length`` of ``sequence``, left to right."""
    for i in range(len(sequence) - kmer_length + 1):
        yield sequence[i:i + kmer_length]
def create_profile(motifs: List[str], pseudocount: int = 0) -> List[List[float]]:
    """Build a 4-row profile matrix (rows A, C, G, T) from equal-length motifs.

    Each entry is ``(count + pseudocount) / (len(motifs) + 4 * pseudocount)``
    for that nucleotide at that position.
    """
    nucleotides: List[str] = ["A", "C", "G", "T"]
    motif_count: int = len(motifs)
    motif_length: int = len(motifs[0])
    denominator = motif_count + 4 * pseudocount

    return [
        [
            (sum(motif[position] == nucleotide for motif in motifs) + pseudocount) / denominator
            for position in range(motif_length)
        ]
        for nucleotide in nucleotides
    ]
def calculate_score(motifs: List[str]) -> int:
    """Return the number of symbols that differ from the column consensus, summed over columns."""
    score = 0
    for column in zip(*motifs):
        # One Counter per column; mismatches = column height minus consensus count.
        consensus_count = Counter(column).most_common(1)[0][1]
        score += len(column) - consensus_count
    return score
def generate_random_kmer(sequence: str, kmer_length: int) -> str:
    """Return a k-mer of ``sequence`` starting at a uniformly random valid position."""
    start_index = random.randint(0, len(sequence) - kmer_length)
    return sequence[start_index:start_index + kmer_length]
def find_best_motifs(search_function: Callable, iterations: int, *args) -> List[str]:
    """Call ``search_function(*args)`` repeatedly and keep the lowest-scoring motifs.

    ``search_function`` must return ``(score, motifs)``. It is called once
    up front and then ``iterations - 1`` more times; ties keep the earlier
    result.
    """
    best_score, best_motifs = search_function(*args)
    for _ in range(iterations - 1):
        score, motifs = search_function(*args)
        if score < best_score:
            best_score, best_motifs = score, motifs
    return best_motifs
def calculate_kmer_probabilities(sequence: str, kmer_length: int, profile: List[List[float]]) -> List[float]:
    """Return the probability under ``profile`` of each k-mer of ``sequence``, in order.

    ``profile`` is indexed as ``profile[row][position]`` with rows in the
    order A, C, G, T.
    """
    nucleotide_to_index: dict[str, int] = {"A": 0, "C": 1, "G": 2, "T": 3}
    return [
        math.prod(profile[nucleotide_to_index[kmer[j]]][j] for j in range(kmer_length))
        for kmer in generate_kmers(sequence, kmer_length)
    ]
def select_random_kmer(sequence: str, kmer_length: int, profile: List[List[float]]) -> str:
    """Draw a k-mer of ``sequence`` at random, weighted by its probability under ``profile``."""
    probabilities = calculate_kmer_probabilities(sequence, kmer_length, profile)
    # random.choices treats the probabilities as relative weights.
    start_index = random.choices(range(len(probabilities)), probabilities)[0]
    return sequence[start_index:start_index + kmer_length]
def gibbs_sampler(dna_sequences: List[str], kmer_length: int, num_iterations: int) -> Tuple[int, List[str]]:
    """Run one Gibbs-sampling motif search and return ``(best_score, best_motifs)``.

    Each iteration picks one sequence at random, builds a profile
    (pseudocount 1) from the other sequences' motifs, and resamples that
    sequence's motif according to the profile. The lowest-scoring motif
    collection seen is kept.
    """
    motifs = [generate_random_kmer(seq, kmer_length) for seq in dna_sequences]
    best_motifs = motifs.copy()
    # Cached so best_motifs is not rescored on every iteration.
    best_score = calculate_score(best_motifs)

    for _ in range(num_iterations):
        i = random.randint(0, len(dna_sequences) - 1)
        profile = create_profile(motifs[:i] + motifs[i + 1:], pseudocount=1)
        motifs[i] = select_random_kmer(dna_sequences[i], kmer_length, profile)

        current_score = calculate_score(motifs)
        if current_score < best_score:
            best_motifs = motifs.copy()
            best_score = current_score

    return best_score, best_motifs
# Sample input (the dataset from section 20.1, not the expected output)
sample_input = """
8 5 100
CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA
"""

# Tokens: k, t, N, then the DNA strings
kmer_length, num_sequences, num_iterations, *dna_sequences = sample_input.strip().split()
kmer_length = int(kmer_length)
num_iterations = int(num_iterations)

result = find_best_motifs(gibbs_sampler, 20, dna_sequences, kmer_length, num_iterations)
print(*result, sep="\n")
21 Implement DistanceBetweenPatternAndStrings
Compute DistanceBetweenPatternAndStrings. Find the distance between a pattern and a set of strings.
Given: A DNA string Pattern and a collection of DNA strings Dna.
Return: DistanceBetweenPatternAndStrings(Pattern, Dna).
21.1 Sample Dataset
AAA
TTACCTTAAC GATATCTGTC ACGGCGTTCG CCCTAAAGAG CGTCAGAGGT
21.2 Sample Output
5
21.3 Solution
from typing import List, Iterator
import math
def generate_kmers(sequence: str, kmer_length: int) -> Iterator[str]:
    """Yield every substring of length ``kmer_length`` of ``sequence``, left to right."""
    for i in range(len(sequence) - kmer_length + 1):
        yield sequence[i:i + kmer_length]
def calculate_hamming_distance(seq1: str, seq2: str) -> int:
    """Return the number of positions at which ``seq1`` and ``seq2`` differ.

    Positions are compared pairwise with ``zip``, so any extra tail of the
    longer string is ignored.
    """
    return sum(base1 != base2 for base1, base2 in zip(seq1, seq2))
def find_minimum_distance(pattern: str, text: str) -> int:
    """Return the smallest Hamming distance between ``pattern`` and any same-length k-mer of ``text``."""
    return min(calculate_hamming_distance(kmer, pattern) for kmer in generate_kmers(text, len(pattern)))
def calculate_pattern_distance_to_strings(pattern: str, dna_strings: List[str]) -> int:
    """Return the sum over ``dna_strings`` of each string's minimum distance to ``pattern``."""
    return sum(find_minimum_distance(pattern, dna_string) for dna_string in dna_strings)
# Sample input
sample_input = """
AAA
TTACCTTAAC GATATCTGTC ACGGCGTTCG CCCTAAAGAG CGTCAGAGGT
"""

# Line 1 is the pattern, line 2 the space-separated DNA strings
pattern, dna_strings_raw = sample_input.strip().split("\n")
dna_strings = dna_strings_raw.split()

result = calculate_pattern_distance_to_strings(pattern, dna_strings)
print(result)
22 Generate the k-mer Composition of a String
String Composition Problem, Generate the k-mer composition of a string.
Given: An integer k and a string Text.
Return: \(Composition_k(Text)\) (the k-mers can be provided in any order).
22.1 Sample Dataset
5
CAATCCAAC
22.2 Sample Output
CAATC
AATCC
ATCCA
TCCAA
CCAAC
22.3 Solution
from typing import Iterator
def generate_kmers(sequence: str, kmer_length: int) -> Iterator[str]:
    """Yield every substring of length ``kmer_length`` of ``sequence``, left to right."""
    for i in range(len(sequence) - kmer_length + 1):
        yield sequence[i:i + kmer_length]
sample_input: str = """
5
CAATCCAAC
"""

# Line 1 is k, line 2 is the text
input_lines: list[str] = sample_input.strip().split("\n")
kmer_length: int = int(input_lines[0])
dna_sequence: str = input_lines[1]

for kmer in generate_kmers(dna_sequence, kmer_length):
    print(kmer)
23 Reconstruct a String from its Genome Path
String Spelled by a Genome Path Problem, Find the string spelled by a genome path.
Given: A sequence of k-mers \(Pattern_1, \ldots, Pattern_n\) such that the last k - 1 symbols of \(Pattern_i\) are equal to the first k - 1 symbols of \(Pattern_{i+1}\) for i from 1 to n-1.
Return: A string Text of length k+n-1 where the i-th k-mer in Text is equal to \(Pattern_i\) for all i.
23.1 Sample Dataset
ACCGA
CCGAA
CGAAG
GAAGC
AAGCT
23.2 Sample Output
ACCGAAGCT
23.3 Solution
from typing import List
def reconstruct_dna_sequence(kmers: List[str]) -> str:
    """Spell the string given by a genome path of consecutive, overlapping k-mers.

    The result is the first k-mer followed by the last symbol of every
    later k-mer. An empty path yields an empty string.
    """
    if not kmers:
        return ""
    return kmers[0] + "".join(kmer[-1] for kmer in kmers[1:])
sample_input: str = """
ACCGA
CCGAA
CGAAG
GAAGC
AAGCT
"""

kmer_list: List[str] = sample_input.strip().split("\n")
print(reconstruct_dna_sequence(kmer_list))
24 Construct the Overlap Graph of a Collection of k-mers
Overlap Graph Problem, Construct the overlap graph of a collection of k-mers.
Given: A collection Patterns of k-mers.
Return: The overlap graph Overlap(Patterns), in the form of an adjacency list.
24.1 Sample Dataset
ATGCG
GCATG
CATGC
AGGCA
GGCAT
24.2 Sample Output
GCATG -> CATGC
CATGC -> ATGCG
AGGCA -> GGCAT
GGCAT -> GCATG
24.3 Solution
from typing import List
def overlap_graph(patterns: List[str]) -> List[tuple[str, str]]:
    """Return the edges ``(a, b)`` of the overlap graph of ``patterns``.

    An edge exists when the suffix of ``a`` (all but its first symbol)
    equals the prefix of ``b`` (all but its last symbol) and ``a`` and ``b``
    sit at different positions in ``patterns``. Edges are ordered by the
    position of ``a`` and then of ``b``.
    """
    # Index positions by prefix so each pattern looks up its successors directly.
    positions_by_prefix: dict[str, List[int]] = {}
    for j, pattern in enumerate(patterns):
        positions_by_prefix.setdefault(pattern[:-1], []).append(j)

    return [
        (pattern, patterns[j])
        for i, pattern in enumerate(patterns)
        for j in positions_by_prefix.get(pattern[1:], ())
        if i != j
    ]
sample_input = """
ATGCG
GCATG
CATGC
AGGCA
GGCAT
"""

Patterns: List[str] = sample_input.strip().split("\n")

adj_list = overlap_graph(Patterns)
for edge in adj_list:
    print(f"{edge[0]} -> {edge[1]}")
25 Construct the De Bruijn Graph of a String
De Bruijn Graph from a String Problem. Construct the de Bruijn graph of a string.
Given: An integer k and a string Text.
Return: \(DeBruijn_k(Text)\), in the form of an adjacency list.
25.1 Sample Dataset
4
AAGATTCTCTAC
25.2 Sample Output
AAG -> AGA
AGA -> GAT
ATT -> TTC
CTA -> TAC
CTC -> TCT
GAT -> ATT
TCT -> CTA,CTC
TTC -> TCT
25.3 Solution
from collections import OrderedDict
from typing import List, Set, OrderedDict as OrderedDictType
def construct_de_bruijn_graph(sequence: str, kmer_length: int) -> OrderedDictType[str, Set[str]]:
    """Build the de Bruijn graph of ``sequence`` over its (k-1)-mers.

    Every (k-1)-mer becomes a key, in order of first appearance, mapped to
    the set of (k-1)-mers that follow it inside some k-mer. Nodes without
    outgoing edges map to an empty set. Because neighbours are stored in a
    set, a k-mer that occurs several times contributes a single edge.
    """
    node_length = kmer_length - 1
    adjacency_list: OrderedDictType[str, Set[str]] = OrderedDict()

    # Register every node first so nodes without outgoing edges are present too.
    for i in range(len(sequence) - kmer_length + 2):
        adjacency_list.setdefault(sequence[i:i + node_length], set())

    for i in range(len(sequence) - kmer_length + 1):
        prefix = sequence[i:i + node_length]
        suffix = sequence[i + 1:i + kmer_length]
        adjacency_list[prefix].add(suffix)

    return adjacency_list
sample_input: str = """
4
AAGATTCTCTAC
"""

# Line 1 is k, line 2 is the text
input_lines: List[str] = sample_input.strip().split("\n")
kmer_length: int = int(input_lines[0])
dna_sequence: str = input_lines[1]

adjacency_list = construct_de_bruijn_graph(dna_sequence, kmer_length)
for node, neighbors in adjacency_list.items():
    if neighbors:
        # Neighbours are a set; sort them so the printed order is reproducible.
        print(f"{node} -> {','.join(sorted(neighbors))}")
26 Construct the De Bruijn Graph of a Collection of k-mers
De Bruijn Graph from k-mers Problem. Construct the de Bruijn graph from a collection of k-mers.
Given: A collection of k-mers Patterns.
Return: The de Bruijn graph DeBruijn(Patterns), in the form of an adjacency list.
26.1 Sample Dataset
GAGG
CAGG
GGGG
GGGA
CAGG
AGGG
GGAG
26.2 Sample Output
GAG -> AGG
CAG -> AGG,AGG
GGG -> GGG,GGA
AGG -> GGG
GGA -> GAG
26.3 Solution
from typing import List, Dict
def construct_de_bruijn_graph(kmers: List[str]) -> Dict[str, List[str]]:
    """Build the de Bruijn graph of ``kmers`` as ``prefix -> [suffix, ...]``.

    Each k-mer adds one edge from its first k-1 symbols to its last k-1
    symbols; repeated k-mers add repeated edges.
    """
    adjacency_list: Dict[str, List[str]] = {}
    for kmer in kmers:
        adjacency_list.setdefault(kmer[:-1], []).append(kmer[1:])
    return adjacency_list
sample_input: str = """
GAGG
CAGG
GGGG
GGGA
CAGG
AGGG
GGAG
"""

kmer_list: List[str] = sample_input.strip().split("\n")

adjacency_list = construct_de_bruijn_graph(kmer_list)
for node, neighbors in adjacency_list.items():
    print(f"{node} -> {','.join(neighbors)}")
27 Find an Eulerian Cycle in a Graph
Eulerian Cycle Problem, Find an Eulerian cycle in a graph.
Given: An Eulerian directed graph, in the form of an adjacency list.
Return: An Eulerian cycle in this graph.
27.1 Sample Dataset
0 -> 3
1 -> 0
2 -> 1,6
3 -> 2
4 -> 2
5 -> 4
6 -> 5,8
7 -> 9
8 -> 7
9 -> 6
27.2 Sample Output
6->8->7->9->6->5->4->2->1->0->3->2->6
27.3 Solution
from re import split
from random import choice
from typing import Dict, List, Tuple
def parse_adjacency_list(adjacency_list_text: List[str]) -> Dict[str, List[str]]:
    """Parse lines of the form ``"node -> a,b,c"`` into ``{node: [a, b, c]}``."""
    adjacency_dict: Dict[str, List[str]] = {}
    for element in adjacency_list_text:
        node, neighbors = split(' -> ', element)
        adjacency_dict[node] = neighbors.split(',')
    return adjacency_dict
def remove_edge(graph: Dict[str, List[str]], source: str, target: str) -> Dict[str, List[str]]:
    """Remove one ``source -> target`` edge in place and return ``graph``.

    A node whose last outgoing edge is removed is deleted from the mapping.
    """
    graph[source].remove(target)
    if not graph[source]:
        del graph[source]
    return graph
def find_eulerian_cycle(graph: Dict[str, List[str]]) -> List[str]:
    """Return an Eulerian cycle of ``graph`` as a node list whose first and last nodes coincide.

    A random closed walk is grown into the full cycle: while unused edges
    remain, the cycle is rotated to start at a node that still has one and
    extended by another random closed walk. ``graph`` is mutated: each edge
    is removed as it is used.
    """
    # Form a cycle by randomly walking in the graph
    start_node, edges = choice(list(graph.items()))
    next_node = choice(edges)
    graph = remove_edge(graph, start_node, next_node)

    cycle: List[str] = [start_node, next_node]
    current_node = next_node
    while current_node != start_node:
        edges = graph[current_node]
        next_node = choice(edges)
        graph = remove_edge(graph, current_node, next_node)
        current_node = next_node
        cycle.append(current_node)

    while graph:
        # Nodes on the current cycle that still have unused outgoing edges
        potential_starts: List[Tuple[int, str]] = [(idx, node) for idx, node in enumerate(cycle) if node in graph]
        idx, new_start = choice(potential_starts)

        # Form new_cycle by traversing cycle (starting at new_start) and then randomly walking
        new_cycle = cycle[idx:] + cycle[1:idx + 1]

        next_node = choice(graph[new_start])
        graph = remove_edge(graph, new_start, next_node)
        current_node = next_node
        new_cycle.append(current_node)
        while current_node != new_start:
            edges = graph[current_node]
            next_node = choice(edges)
            graph = remove_edge(graph, current_node, next_node)
            current_node = next_node
            new_cycle.append(current_node)
        cycle = new_cycle

    return cycle
sample_input: str = """
0 -> 3
1 -> 0
2 -> 1,6
3 -> 2
4 -> 2
5 -> 4
6 -> 5,8
7 -> 9
8 -> 7
9 -> 6
"""

input_lines: List[str] = sample_input.strip().split("\n")
adjacency_list = parse_adjacency_list(input_lines)

print("->".join(find_eulerian_cycle(adjacency_list)))
28 Find an Eulerian Path in a Graph
Eulerian Path Problem, Find an Eulerian path in a graph.
Given: A directed graph that contains an Eulerian path, where the graph is given in the form of an adjacency list.
Return: An Eulerian path in this graph.
28.1 Sample Dataset
0 -> 2
1 -> 3
2 -> 1
3 -> 0,4
6 -> 3,7
7 -> 8
8 -> 9
9 -> 6
28.2 Sample Output
6->7->8->9->6->3->0->2->1->3->4
28.3 Solution
from re import split
from random import choice
from typing import Dict, List, Tuple
def parse_adjacency_list(adjacency_text: List[str]) -> Dict[str, List[str]]:
    """Parse lines of the form ``"node -> a,b,c"`` into ``{node: [a, b, c]}``."""
    adjacency_dict: Dict[str, List[str]] = {}
    for line in adjacency_text:
        node, neighbors = split(' -> ', line)
        adjacency_dict[node] = neighbors.split(',')
    return adjacency_dict
def remove_edge(graph: Dict[str, List[str]], source: str, target: str) -> Dict[str, List[str]]:
    """Remove one ``source -> target`` edge in place and return ``graph``.

    A node whose last outgoing edge is removed is deleted from the mapping.
    """
    graph[source].remove(target)
    if not graph[source]:
        del graph[source]
    return graph
def find_eulerian_cycle(graph: Dict[str, List[str]]) -> List[str]:
    """Return an Eulerian cycle of ``graph`` as a node list whose first and last nodes coincide.

    A random closed walk is grown into the full cycle: while unused edges
    remain, the cycle is rotated to start at a node that still has one and
    extended by another random closed walk. ``graph`` is mutated: each edge
    is removed as it is used.
    """
    start_node, edges = choice(list(graph.items()))
    next_node = choice(edges)
    graph = remove_edge(graph, start_node, next_node)

    cycle: List[str] = [start_node, next_node]
    current_node = next_node
    while current_node != start_node:
        edges = graph[current_node]
        next_node = choice(edges)
        graph = remove_edge(graph, current_node, next_node)
        current_node = next_node
        cycle.append(current_node)

    while graph:
        # Nodes on the current cycle that still have unused outgoing edges
        potential_starts: List[Tuple[int, str]] = [(idx, node) for idx, node in enumerate(cycle) if node in graph]
        idx, new_start = choice(potential_starts)

        # Rotate the cycle so it starts and ends at new_start, then keep walking
        new_cycle = cycle[idx:] + cycle[1:idx + 1]

        next_node = choice(graph[new_start])
        graph = remove_edge(graph, new_start, next_node)
        current_node = next_node
        new_cycle.append(current_node)
        while current_node != new_start:
            edges = graph[current_node]
            next_node = choice(edges)
            graph = remove_edge(graph, current_node, next_node)
            current_node = next_node
            new_cycle.append(current_node)
        cycle = new_cycle

    return cycle
def find_eulerian_path(graph: Dict[str, List[str]]) -> List[str]:
    """Return the node sequence of an Eulerian path through ``graph``.

    One edge is added from the path's final node to its first node, an
    Eulerian cycle is traced, and the cycle is cut at that added edge.
    ``graph`` is mutated in the process. If no node is unbalanced, the
    Eulerian cycle itself is returned.
    """
    # Out-degree minus in-degree for every node that appears in the graph
    degree_differences: Dict[str, int] = {}
    for source, targets in graph.items():
        degree_differences[source] = degree_differences.get(source, 0) + len(targets)
        for target in targets:
            degree_differences[target] = degree_differences.get(target, 0) - 1

    # The path ends at the node with one surplus incoming edge and starts at
    # the node with one surplus outgoing edge.
    path_end = next((node for node, diff in degree_differences.items() if diff == -1), None)
    path_start = next((node for node, diff in degree_differences.items() if diff == 1), None)
    if path_end is None or path_start is None:
        return find_eulerian_cycle(graph)

    # Close the path into a cycle with one extra edge: path_end -> path_start
    graph.setdefault(path_end, []).append(path_start)

    cycle = find_eulerian_cycle(graph)
    for idx, node in enumerate(cycle):
        if node == path_end and cycle[(idx + 1) % len(cycle)] == path_start:
            # Cut at the extra edge; cycle[0] == cycle[-1], so skip the duplicate.
            return cycle[idx + 1:] + cycle[1:idx + 1]

    return cycle  # This should never happen if the input is valid
sample_input: str = """
0 -> 2
1 -> 3
2 -> 1
3 -> 0,4
6 -> 3,7
7 -> 8
8 -> 9
9 -> 6
"""

input_lines: List[str] = sample_input.strip().split("\n")
adjacency_list = parse_adjacency_list(input_lines)

print("->".join(find_eulerian_path(adjacency_list)))
29 Reconstruct a String from its k-mer Composition
String Reconstruction Problem. Reconstruct a string from its k-mer composition.
Given: An integer k followed by a list of k-mers Patterns.
Return: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may return any one.)
29.1 Sample Dataset
4
CTTA
ACCA
TACC
GGCT
GCTT
TTAC
29.2 Sample Output
GGCTTACCA
29.3 Solution
from typing import List, Dict, Tuple
from random import choice
def construct_de_bruijn_graph(kmers: List[str]) -> Dict[str, List[str]]:
    """Build the de Bruijn graph of ``kmers`` as ``prefix -> [suffix, ...]``.

    Each k-mer adds one edge from its first k-1 symbols to its last k-1
    symbols; repeated k-mers add repeated edges.
    """
    adjacency_dict: Dict[str, List[str]] = {}
    for kmer in kmers:
        adjacency_dict.setdefault(kmer[:-1], []).append(kmer[1:])
    return adjacency_dict
def remove_edge(graph: Dict[str, List[str]], source: str, target: str) -> Dict[str, List[str]]:
    """Remove one ``source -> target`` edge in place and return ``graph``.

    A node whose last outgoing edge is removed is deleted from the mapping.
    """
    graph[source].remove(target)
    if not graph[source]:
        del graph[source]
    return graph
def find_eulerian_cycle(graph: Dict[str, List[str]]) -> List[str]:
    """Return an Eulerian cycle of ``graph`` as a node list whose first and last nodes coincide.

    A random closed walk is grown into the full cycle: while unused edges
    remain, the cycle is rotated to start at a node that still has one and
    extended by another random closed walk. ``graph`` is mutated: each edge
    is removed as it is used.
    """
    start_node, edges = choice(list(graph.items()))
    next_node = choice(edges)
    graph = remove_edge(graph, start_node, next_node)

    cycle: List[str] = [start_node, next_node]
    current_node = next_node
    while current_node != start_node:
        edges = graph[current_node]
        next_node = choice(edges)
        graph = remove_edge(graph, current_node, next_node)
        current_node = next_node
        cycle.append(current_node)

    while graph:
        # Nodes on the current cycle that still have unused outgoing edges
        potential_starts: List[Tuple[int, str]] = [(idx, node) for idx, node in enumerate(cycle) if node in graph]
        idx, new_start = choice(potential_starts)

        # Rotate the cycle so it starts and ends at new_start, then keep walking
        new_cycle = cycle[idx:] + cycle[1:idx + 1]

        next_node = choice(graph[new_start])
        graph = remove_edge(graph, new_start, next_node)
        current_node = next_node
        new_cycle.append(current_node)
        while current_node != new_start:
            edges = graph[current_node]
            next_node = choice(edges)
            graph = remove_edge(graph, current_node, next_node)
            current_node = next_node
            new_cycle.append(current_node)
        cycle = new_cycle

    return cycle
def find_eulerian_path(graph: Dict[str, List[str]]) -> List[str]:
    """Return the node sequence of an Eulerian path through ``graph``.

    One edge is added from the path's final node to its first node, an
    Eulerian cycle is traced, and the cycle is cut at that added edge.
    ``graph`` is mutated in the process. If no node is unbalanced, the
    Eulerian cycle itself is returned.
    """
    # Out-degree minus in-degree for every node that appears in the graph
    degree_differences: Dict[str, int] = {}
    for source, targets in graph.items():
        degree_differences[source] = degree_differences.get(source, 0) + len(targets)
        for target in targets:
            degree_differences[target] = degree_differences.get(target, 0) - 1

    # The path ends at the node with one surplus incoming edge and starts at
    # the node with one surplus outgoing edge.
    path_end = next((node for node, diff in degree_differences.items() if diff == -1), None)
    path_start = next((node for node, diff in degree_differences.items() if diff == 1), None)
    if path_end is None or path_start is None:
        return find_eulerian_cycle(graph)

    # Close the path into a cycle with one extra edge: path_end -> path_start
    graph.setdefault(path_end, []).append(path_start)

    cycle = find_eulerian_cycle(graph)
    for idx, node in enumerate(cycle):
        if node == path_end and cycle[(idx + 1) % len(cycle)] == path_start:
            # Cut at the extra edge; cycle[0] == cycle[-1], so skip the duplicate.
            return cycle[idx + 1:] + cycle[1:idx + 1]

    return cycle  # This should never happen if the input is valid
def reconstruct_string(kmers: List[str]) -> str:
    """Reconstruct a string whose k-mer composition is ``kmers``.

    The k-mers are turned into a de Bruijn graph, an Eulerian path is
    found, and the path is spelled: the first node followed by the last
    symbol of every later node.
    """
    adjacency_list = construct_de_bruijn_graph(kmers)
    path = find_eulerian_path(adjacency_list)
    # path[0][:-1] plus the last symbol of every node (including the first)
    return path[0][:-1] + "".join(node[-1] for node in path)
sample_input: str = """
4
CTTA
ACCA
TACC
GGCT
GCTT
TTAC
"""

# Line 1 is k (implied by the k-mers themselves), the rest are the k-mers
input_lines: List[str] = sample_input.strip().split("\n")
k: int = int(input_lines[0])
kmers: List[str] = input_lines[1:]

print(reconstruct_string(kmers))
30 Find a k-Universal Circular String
k-Universal Circular String Problem. Find a k-universal circular binary string.
Given: An integer k.
Return: A k-universal circular string. (If multiple answers exist, you may return any one.)
30.1 Sample Dataset
4
30.2 Sample Output
1111010010110000
30.3 Solution
from typing import List, Dict, Tuple
from random import choice
def construct_de_bruijn_graph(kmers: List[str]) -> Dict[str, List[str]]:
    """Build the de Bruijn graph of ``kmers`` as ``prefix -> [suffix, ...]``.

    Each k-mer adds one edge from its first k-1 symbols to its last k-1
    symbols; repeated k-mers add repeated edges.
    """
    adjacency_dict: Dict[str, List[str]] = {}
    for kmer in kmers:
        adjacency_dict.setdefault(kmer[:-1], []).append(kmer[1:])
    return adjacency_dict
def remove_edge(graph: Dict[str, List[str]], source: str, target: str) -> Dict[str, List[str]]:
    """Remove one ``source -> target`` edge in place and return ``graph``.

    A node whose last outgoing edge is removed is deleted from the mapping.
    """
    graph[source].remove(target)
    if not graph[source]:
        del graph[source]
    return graph
def find_eulerian_cycle(graph: Dict[str, List[str]]) -> List[str]:
    """Return an Eulerian cycle of ``graph`` as a node list whose first and last nodes coincide.

    A random closed walk is grown into the full cycle: while unused edges
    remain, the cycle is rotated to start at a node that still has one and
    extended by another random closed walk. ``graph`` is mutated: each edge
    is removed as it is used.
    """
    start_node, edges = choice(list(graph.items()))
    next_node = choice(edges)
    graph = remove_edge(graph, start_node, next_node)

    cycle: List[str] = [start_node, next_node]
    current_node = next_node
    while current_node != start_node:
        edges = graph[current_node]
        next_node = choice(edges)
        graph = remove_edge(graph, current_node, next_node)
        current_node = next_node
        cycle.append(current_node)

    while graph:
        # Nodes on the current cycle that still have unused outgoing edges
        potential_starts: List[Tuple[int, str]] = [(idx, node) for idx, node in enumerate(cycle) if node in graph]
        idx, new_start = choice(potential_starts)

        # Rotate the cycle so it starts and ends at new_start, then keep walking
        new_cycle = cycle[idx:] + cycle[1:idx + 1]

        next_node = choice(graph[new_start])
        graph = remove_edge(graph, new_start, next_node)
        current_node = next_node
        new_cycle.append(current_node)
        while current_node != new_start:
            edges = graph[current_node]
            next_node = choice(edges)
            graph = remove_edge(graph, current_node, next_node)
            current_node = next_node
            new_cycle.append(current_node)
        cycle = new_cycle

    return cycle
def generate_k_universal_circular_string(k: int) -> str:
    """Return a binary circular string containing every binary k-mer exactly once.

    All ``2 ** k`` binary k-mers form a de Bruijn graph; an Eulerian cycle
    through it is spelled and the wrap-around tail is dropped, giving a
    string of length ``2 ** k``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if k == 1:
        # The graph nodes would be empty strings, which cannot be spelled below.
        return "01"

    kmers: List[str] = [format(i, f'0{k}b') for i in range(2 ** k)]

    adjacency_list = construct_de_bruijn_graph(kmers)
    cycle = find_eulerian_cycle(adjacency_list)

    # The last k - 1 nodes only repeat the start of the circular string.
    cycle = cycle[:len(cycle) - k + 1]
    return cycle[0][:-1] + "".join(node[-1] for node in cycle)
sample_input: str = """
4
"""

input_lines: List[str] = sample_input.strip().split("\n")
k: int = int(input_lines[0])

print(generate_k_universal_circular_string(k))
31 Reconstruct a String from its Paired Composition
String Reconstruction from Read-Pairs Problem. Reconstruct a string from its paired composition.
Given: Integers k and d followed by a collection of paired k-mers PairedReads.
Return: A string Text with (k, d)-mer composition equal to PairedReads. (If multiple answers exist, you may return any one.)
31.1 Sample Dataset
4 2
GAGA|TTGA
TCGT|GATG
CGTG|ATGT
TGGT|TGAG
GTGA|TGTT
GTGG|GTGA
TGAG|GTTG
GGTC|GAGA
GTCG|AGAT
31.2 Sample Output
GTGGTCGTGAGATGTTGA
31.3 Solution
import sys
from collections import defaultdict
from typing import List, Tuple, Dict
from random import choice
def remove_edge(graph: Dict[str, List[str]], source: str, target: str) -> Dict[str, List[str]]:
    """Remove one ``source -> target`` edge in place and return ``graph``.

    A node whose last outgoing edge is removed is deleted from the mapping.
    """
    graph[source].remove(target)
    if not graph[source]:
        del graph[source]
    return graph
def find_eulerian_cycle(graph: Dict[str, List[str]]) -> List[str]:
    """Return an Eulerian cycle of ``graph`` as a node list whose first and last nodes coincide.

    A random closed walk is grown into the full cycle: while unused edges
    remain, the cycle is rotated to start at a node that still has one and
    extended by another random closed walk. ``graph`` is mutated: each edge
    is removed as it is used.
    """
    start_node, edges = choice(list(graph.items()))
    next_node = choice(edges)
    graph = remove_edge(graph, start_node, next_node)

    cycle: List[str] = [start_node, next_node]
    current_node = next_node
    while current_node != start_node:
        edges = graph[current_node]
        next_node = choice(edges)
        graph = remove_edge(graph, current_node, next_node)
        current_node = next_node
        cycle.append(current_node)

    while graph:
        # Nodes on the current cycle that still have unused outgoing edges
        potential_starts: List[Tuple[int, str]] = [(idx, node) for idx, node in enumerate(cycle) if node in graph]
        idx, new_start = choice(potential_starts)

        # Rotate the cycle so it starts and ends at new_start, then keep walking
        new_cycle = cycle[idx:] + cycle[1:idx + 1]

        next_node = choice(graph[new_start])
        graph = remove_edge(graph, new_start, next_node)
        current_node = next_node
        new_cycle.append(current_node)
        while current_node != new_start:
            edges = graph[current_node]
            next_node = choice(edges)
            graph = remove_edge(graph, current_node, next_node)
            current_node = next_node
            new_cycle.append(current_node)
        cycle = new_cycle

    return cycle
def find_eulerian_path(graph: Dict[str, List[str]]) -> List[str]:
    """Return the node sequence of an Eulerian path through ``graph``.

    One edge is added from the path's final node to its first node, an
    Eulerian cycle is traced, and the cycle is cut at that added edge.
    ``graph`` is mutated in the process. If no node is unbalanced, the
    Eulerian cycle itself is returned.
    """
    # Out-degree minus in-degree for every node that appears in the graph
    degree_differences: Dict[str, int] = {}
    for source, targets in graph.items():
        degree_differences[source] = degree_differences.get(source, 0) + len(targets)
        for target in targets:
            degree_differences[target] = degree_differences.get(target, 0) - 1

    # The path ends at the node with one surplus incoming edge and starts at
    # the node with one surplus outgoing edge.
    path_end = next((node for node, diff in degree_differences.items() if diff == -1), None)
    path_start = next((node for node, diff in degree_differences.items() if diff == 1), None)
    if path_end is None or path_start is None:
        return find_eulerian_cycle(graph)

    # Close the path into a cycle with one extra edge: path_end -> path_start
    graph.setdefault(path_end, []).append(path_start)

    cycle = find_eulerian_cycle(graph)
    for idx, node in enumerate(cycle):
        if node == path_end and cycle[(idx + 1) % len(cycle)] == path_start:
            # Cut at the extra edge; cycle[0] == cycle[-1], so skip the duplicate.
            return cycle[idx + 1:] + cycle[1:idx + 1]

    return cycle  # This should never happen if the input is valid
def construct_de_bruijn_graph_paired_reads(paired_reads: List[Tuple[str, str]]) -> Dict[Tuple[str, str], List[Tuple[str, str]]]:
    """Build the paired de Bruijn graph of ``paired_reads``.

    Each read pair ``(a, b)`` adds one edge from the pair of prefixes
    ``(a[:-1], b[:-1])`` to the pair of suffixes ``(a[1:], b[1:])``. The
    result is a ``defaultdict(list)``.
    """
    graph = defaultdict(list)
    for first, second in paired_reads:
        graph[(first[:-1], second[:-1])].append((first[1:], second[1:]))
    return graph
def string_spelled_by_gapped_patterns(gapped_patterns: List[Tuple[str, str]], k: int, d: int) -> str:
    """Spell the string given by a sequence of gapped patterns, or return "-1".

    The first and second components are each spelled into a string (first
    symbol of every pattern but the last, then the whole last pattern). The
    second string must agree with the first at an offset of ``k + d + 1``;
    if it does not, "-1" is returned. Otherwise the first string is
    extended by the final ``k + d + 1`` symbols of the second.
    """
    prefix_string = ''.join(pattern[0][0] for pattern in gapped_patterns[:-1]) + gapped_patterns[-1][0]
    suffix_string = ''.join(pattern[1][0] for pattern in gapped_patterns[:-1]) + gapped_patterns[-1][1]

    offset = k + d + 1
    for i in range(offset, len(prefix_string)):
        if prefix_string[i] != suffix_string[i - offset]:
            return "-1"

    return prefix_string + suffix_string[len(suffix_string) - offset:]
def reconstruct_string_from_read_pairs(k: int, d: int, paired_reads: List[Tuple[str, str]], max_attempts: int = 100) -> str:
    """Reconstruct a string whose (k, d)-mer composition is ``paired_reads``.

    An Eulerian path in the paired de Bruijn graph is spelled into a string.
    The path is chosen at random and not every Eulerian path spells a
    consistent string, so the search is repeated on a freshly built graph
    until a valid string is found or ``max_attempts`` tries are used up.
    Returns "-1" if no attempt produced a valid string.
    """
    text = "-1"
    for _ in range(max(1, max_attempts)):
        # Rebuilt every attempt because the path search mutates the graph.
        graph = construct_de_bruijn_graph_paired_reads(paired_reads)
        path = find_eulerian_path(graph)
        # Graph nodes are (k-1)-mers, hence k - 1.
        text = string_spelled_by_gapped_patterns(path, k - 1, d)
        if text != "-1":
            break
    return text
sample_input: str = """
4 2
GAGA|TTGA
TCGT|GATG
CGTG|ATGT
TGGT|TGAG
GTGA|TGTT
GTGG|GTGA
TGAG|GTTG
GGTC|GAGA
GTCG|AGAT
"""

# Line 1 holds k and d; every later line is one read pair "first|second"
input_lines: List[str] = sample_input.strip().split("\n")
k, d = map(int, input_lines[0].split())
paired_reads: List[Tuple[str, str]] = [tuple(line.split("|")) for line in input_lines[1:]]

print(reconstruct_string_from_read_pairs(k, d, paired_reads))
32 Generate Contigs from a Collection of Reads
Contig Generation Problem. Generate the contigs from a collection of reads (with imperfect coverage).
Given: A collection of k-mers Patterns.
Return: All contigs in DeBruijn(Patterns). (You may return the strings in any order.)
32.1 Sample Dataset
ATG
ATG
TGT
TGG
CAT
GGA
GAT
AGA
32.2 Sample Output
AGA ATG ATG CAT GAT TGGA TGT
32.3 Solution
from typing import List, Dict, Tuple
from collections import defaultdict
def construct_de_bruijn_graph(kmers: List[str]) -> Dict[str, List[str]]:
    """Build the de Bruijn graph of ``kmers`` as ``prefix -> [suffix, ...]``.

    Each k-mer adds one edge from its first k-1 symbols to its last k-1
    symbols; repeated k-mers add repeated edges. The result is a
    ``defaultdict(list)``.
    """
    adjacency_dict: Dict[str, List[str]] = defaultdict(list)
    for kmer in kmers:
        prefix, suffix = kmer[:-1], kmer[1:]
        adjacency_dict[prefix].append(suffix)
    return adjacency_dict
def remove_edge(graph: Dict[str, List[str]], source: str, target: str) -> Dict[str, List[str]]:
    """Remove one ``source -> target`` edge in place and return ``graph``.

    A node whose last outgoing edge is removed is deleted from the mapping.
    """
    graph[source].remove(target)
    if not graph[source]:
        del graph[source]
    return graph
def find_maximal_non_branching_paths(graph: Dict[str, List[str]]) -> List[List[str]]:
    """Return all maximal non-branching paths of ``graph`` as node lists.

    Paths start at every node that is not 1-in-1-out and has outgoing
    edges, and run through 1-in-1-out nodes. Whatever edges remain
    afterwards form isolated cycles, which are returned with their start
    node repeated at the end. ``graph`` is mutated: each edge is removed as
    it is used.
    """
    paths: List[List[str]] = []
    # node -> [in-degree, out-degree]
    in_out_degrees: Dict[str, List[int]] = {}

    # Calculate in and out degrees
    for source, targets in graph.items():
        in_out_degrees.setdefault(source, [0, 0])[1] += len(targets)
        for target in targets:
            in_out_degrees.setdefault(target, [0, 0])[0] += 1

    # Find all non-branching paths
    for node in list(in_out_degrees):
        if in_out_degrees[node] == [1, 1] or in_out_degrees[node][1] <= 0:
            continue
        # One path per outgoing edge of this branching/starting node
        while node in graph:
            next_node = graph[node][0]
            non_branching_path = [node, next_node]
            graph = remove_edge(graph, node, next_node)
            while in_out_degrees[next_node] == [1, 1]:
                following_node = graph[next_node][0]
                non_branching_path.append(following_node)
                graph = remove_edge(graph, next_node, following_node)
                next_node = following_node
            paths.append(non_branching_path)

    # Find isolated cycles
    while graph:
        start_node = next(iter(graph))
        current_node = graph[start_node][0]
        graph = remove_edge(graph, start_node, current_node)
        cycle = [start_node, current_node]
        while current_node != start_node:
            next_node = graph[current_node][0]
            cycle.append(next_node)
            graph = remove_edge(graph, current_node, next_node)
            current_node = next_node
        paths.append(cycle)

    return paths
def generate_contigs(kmers: List[str]) -> List[str]:
    """Return the contigs spelled by the maximal non-branching paths of DeBruijn(``kmers``).

    Each path is spelled as its first node followed by the last symbol of
    every later node.
    """
    graph = construct_de_bruijn_graph(kmers)
    paths = find_maximal_non_branching_paths(graph)
    return [path[0] + "".join(node[-1] for node in path[1:]) for path in paths]
sample_input: str = """
ATG
ATG
TGT
TGG
CAT
GGA
GAT
AGA
"""

kmers: List[str] = sample_input.strip().split("\n")
contigs = generate_contigs(kmers)

contigs.sort()
print(" ".join(contigs))
33 Construct a String Spelled by a Gapped Genome Path
Gapped Genome Path String Problem. Reconstruct a string from a sequence of (k,d)-mers corresponding to a path in a paired de Bruijn graph.
Given: A sequence of (k, d)-mers \((a_1|b_1), \ldots, (a_n|b_n)\) such that \(Suffix(a_i|b_i) = Prefix(a_{i+1}|b_{i+1})\) for all i from 1 to n-1.
Return: A string Text where the i-th k-mer in Text is equal to \(Suffix(a_i|b_i)\) for all i from 1 to n, if such a string exists.
33.1 Sample Dataset
4 2
GACC|GCGC
ACCG|CGCC
CCGA|GCCG
CGAG|CCGG
GAGC|CGGA
33.2 Sample Output
GACCGAGCGCCGGA
33.3 Solution
from typing import List, Tuple
def reconstruct_string_from_gapped_patterns(gapped_patterns: List[Tuple[str, str]], k: int, d: int) -> str:
    """Spell the string given by a sequence of gapped patterns, or return "-1".

    The first and second components are each spelled into a string (first
    symbol of every pattern but the last, then the whole last pattern). The
    second string must agree with the first at an offset of ``k + d + 1``;
    if it does not, "-1" is returned. Otherwise the first string is
    extended by the final ``k + d + 1`` symbols of the second.
    """
    *leading, (last_prefix, last_suffix) = gapped_patterns
    prefix_string = ''.join(prefix[0] for prefix, _ in leading) + last_prefix
    suffix_string = ''.join(suffix[0] for _, suffix in leading) + last_suffix

    offset = k + d + 1
    for i in range(offset, len(prefix_string)):
        if prefix_string[i] != suffix_string[i - offset]:
            return "-1"

    return prefix_string + suffix_string[len(suffix_string) - offset:]
sample_input: str = """
4 2
GACC|GCGC
ACCG|CGCC
CCGA|GCCG
CGAG|CCGG
GAGC|CGGA
"""

# Line 1 holds k and d; every later line is one gapped pattern "first|second"
input_lines: List[str] = sample_input.strip().split("\n")
k, d = map(int, input_lines[0].split())
gapped_patterns: List[Tuple[str, str]] = [tuple(line.split("|")) for line in input_lines[1:]]

print(reconstruct_string_from_gapped_patterns(gapped_patterns, k - 1, d))
34 Generate All Maximal Non-Branching Paths in a Graph
Maximal Non-Branching Path Problem. Find all maximal non-branching paths in a graph.
Given: The adjacency list of a graph whose nodes are integers.
Return: The collection of all maximal non-branching paths in the graph.
34.1 Sample Dataset
1 -> 2
2 -> 3
3 -> 4,5
6 -> 7
7 -> 6
34.2 Sample Output
1 -> 2 -> 3
3 -> 4
3 -> 5
6 -> 7 -> 6
34.3 Solution
from typing import List, Dict, Tuple
from re import split
def parse_adjacency_list(adjacency_text: List[str]) -> Dict[str, List[str]]:
    """Parse lines of the form ``"node -> a,b,c"`` into ``{node: [a, b, c]}``."""
    adjacency_dict: Dict[str, List[str]] = {}
    for line in adjacency_text:
        source, targets = split(' -> ', line)
        adjacency_dict[source] = targets.split(',')
    return adjacency_dict
def remove_edge(graph: Dict[str, List[str]], source: str, target: str) -> Dict[str, List[str]]:
    """Remove one ``source -> target`` edge in place and return ``graph``.

    A node whose last outgoing edge is removed is deleted from the mapping.
    """
    graph[source].remove(target)
    if not graph[source]:
        del graph[source]
    return graph
def find_maximal_non_branching_paths(graph: Dict[str, List[str]]) -> List[List[str]]:
    """Return every maximal non-branching path of a directed graph.

    Paths that start at a branching node are listed first, followed by the
    isolated cycles (spelled with the start node repeated at the end).

    Args:
        graph: Adjacency list mapping each node to its successors. It is not
            modified; the edges are consumed from an internal copy.

    Returns:
        List of paths, each a list of node labels.
    """
    # Private copy of the edges still to be used; nodes without successors
    # are omitted so that every key has at least one outgoing edge.
    remaining: Dict[str, List[str]] = {
        node: list(targets) for node, targets in graph.items() if targets
    }

    in_degree: Dict[str, int] = {}
    out_degree: Dict[str, int] = {}
    for node, targets in graph.items():
        in_degree.setdefault(node, 0)
        out_degree[node] = out_degree.get(node, 0) + len(targets)
        for target in targets:
            in_degree[target] = in_degree.get(target, 0) + 1
            out_degree.setdefault(target, 0)

    def is_one_in_one_out(node: str) -> bool:
        return in_degree[node] == 1 and out_degree[node] == 1

    def take_edge(source: str) -> str:
        """Consume and return the first unused outgoing edge of ``source``."""
        target = remaining[source].pop(0)
        if not remaining[source]:
            del remaining[source]
        return target

    paths: List[List[str]] = []

    # Paths starting at every node that is not a simple pass-through node.
    for node in in_degree:
        if is_one_in_one_out(node) or out_degree[node] == 0:
            continue
        while node in remaining:
            path = [node, take_edge(node)]
            while is_one_in_one_out(path[-1]):
                path.append(take_edge(path[-1]))
            paths.append(path)

    # Whatever is left consists solely of isolated cycles.
    while remaining:
        start_node = next(iter(remaining))
        cycle = [start_node, take_edge(start_node)]
        while cycle[-1] != start_node:
            cycle.append(take_edge(cycle[-1]))
        paths.append(cycle)

    return paths
# Sample dataset: one adjacency-list line per source node.
sample_input: str = """
1 -> 2
2 -> 3
3 -> 4,5
6 -> 7
7 -> 6
"""
input_lines: List[str] = sample_input.strip().split("\n")
adjacency_list = parse_adjacency_list(input_lines)

result = find_maximal_non_branching_paths(adjacency_list)
for path in result:
    print(" -> ".join(path))
35 Translate an RNA String into an Amino Acid String
Protein Translation Problem. Translate an RNA string into an amino acid string.
Given: An RNA string Pattern.
Return: The translation of Pattern into an amino acid string Peptide.
35.1 Sample Dataset
AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA
35.2 Sample Output
MAMAPRTEINSTRING
35.3 Solution
from typing import Dict, List
def translate_rna_to_protein(rna_sequence: str) -> str:
    """Translate an RNA string into its amino acid string.

    Translation reads consecutive codons from the start of the string and
    stops at the first stop codon or at a trailing incomplete codon. Codons
    that are not in the table contribute nothing to the result.
    """
    # Lookup table mapping RNA codons to amino acids ('*' marks a stop codon).
    codon_table: Dict[str, str] = {
        'UUU': 'F', 'UUC': 'F', 'UUA': 'L', 'UUG': 'L',
        'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S',
        'UAU': 'Y', 'UAC': 'Y', 'UAA': '*', 'UAG': '*',
        'UGU': 'C', 'UGC': 'C', 'UGA': '*', 'UGG': 'W',
        'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
        'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAU': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'AUU': 'I', 'AUC': 'I', 'AUA': 'I', 'AUG': 'M',
        'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAU': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGU': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
        'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAU': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    }
    amino_acids: List[str] = []

    # Walk the sequence one complete codon (three nucleotides) at a time.
    for start in range(0, len(rna_sequence) - 2, 3):
        amino_acid = codon_table.get(rna_sequence[start:start + 3], '')
        # A stop codon ends translation.
        if amino_acid == '*':
            break
        amino_acids.append(amino_acid)

    return ''.join(amino_acids)
# Input RNA sequence
sample_input: str = """
AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA
"""
rna_sequence: str = ''.join(sample_input.strip().split())

# Translate the RNA into a protein
protein: str = translate_rna_to_protein(rna_sequence)
print(protein)
36 Find Substrings of a Genome Encoding a Given Amino Acid String
Peptide Encoding Problem. Find substrings of a genome encoding a given amino acid sequence.
Given: A DNA string Text and an amino acid string Peptide.
Return: All substrings of Text encoding Peptide (if any such substrings exist).
36.1 Sample Dataset
ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA
MA
36.2 Sample Output
ATGGCC
GGCCAT
ATGGCC
36.3 Solution
from typing import Dict, List
# Built once at import time: the translation routine is called for every
# window of the genome scan, so the table must not be rebuilt per call.
_CODON_TO_AMINO_ACID: Dict[str, str] = {
    'UUU': 'F', 'UUC': 'F', 'UUA': 'L', 'UUG': 'L',
    'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S',
    'UAU': 'Y', 'UAC': 'Y', 'UAA': '*', 'UAG': '*',
    'UGU': 'C', 'UGC': 'C', 'UGA': '*', 'UGG': 'W',
    'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
    'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAU': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'AUU': 'I', 'AUC': 'I', 'AUA': 'I', 'AUG': 'M',
    'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAU': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGU': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
    'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAU': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}


def translate_rna_to_protein(rna_sequence: str) -> str:
    """Translate an RNA string into its amino acid string.

    Reads consecutive codons from the start, stopping at the first stop codon
    ('*') or at a trailing incomplete codon. Codons missing from the table
    contribute nothing to the result.
    """
    protein_sequence: List[str] = []
    for start in range(0, len(rna_sequence) - 2, 3):
        amino_acid = _CODON_TO_AMINO_ACID.get(rna_sequence[start:start + 3], '')
        if amino_acid == '*':
            break
        protein_sequence.append(amino_acid)
    return ''.join(protein_sequence)
def reverse_complement(dna_sequence: str) -> str:
    """Return the reverse complement of a DNA string.

    Characters other than A, C, G and T are kept as they are (only reversed).
    """
    return dna_sequence[::-1].translate(str.maketrans("ACGT", "TGCA"))
def dna_to_rna(dna_sequence: str) -> str:
    """Transcribe a DNA string into RNA by replacing every T with U."""
    return dna_sequence.replace("T", "U")
def find_peptide_encoding_substrings(dna_sequence: str, peptide: str) -> List[str]:
    """Find every substring of a genome that encodes the given peptide.

    A substring encodes the peptide when either it or its reverse complement
    transcribes and translates into exactly ``peptide``.

    Args:
        dna_sequence: The genome to scan.
        peptide: Amino acid string to look for.

    Returns:
        Matching DNA substrings in order of their start position (repeated
        occurrences are reported repeatedly). Empty when ``peptide`` is empty.
    """
    # Without this guard an empty peptide would "match" at every position.
    if not peptide:
        return []

    window = 3 * len(peptide)
    encoding_substrings: List[str] = []
    for start in range(len(dna_sequence) - window + 1):
        dna_substring = dna_sequence[start:start + window]
        forward_rna = dna_to_rna(dna_substring)
        reverse_rna = dna_to_rna(reverse_complement(dna_substring))
        if peptide in (translate_rna_to_protein(forward_rna),
                       translate_rna_to_protein(reverse_rna)):
            encoding_substrings.append(dna_substring)
    return encoding_substrings
# Sample dataset: the genome on the first line, the peptide on the second.
sample_input: str = """
ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA
MA
"""
dna_sequence, peptide = sample_input.strip().split('\n')
result: List[str] = find_peptide_encoding_substrings(dna_sequence, peptide)
print("\n".join(result))
37 Generate the Theoretical Spectrum of a Cyclic Peptide
Generating Theoretical Spectrum Problem. Generate the theoretical spectrum of a cyclic peptide.
Given: An amino acid string Peptide.
Return: Cyclospectrum(Peptide).
37.1 Sample Dataset
LEQN
37.2 Sample Output
0 113 114 128 129 227 242 242 257 355 356 370 371 484
37.3 Solution
from typing import Dict, List
# Integer masses of the 20 standard amino acids, keyed by one-letter code.
AMINO_ACID_MASSES: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163
}


def calculate_cyclospectrum(peptide: str) -> List[int]:
    """Return the sorted theoretical spectrum of a cyclic peptide.

    The spectrum holds 0, the total mass, and the mass of every contiguous
    subpeptide of the cycle shorter than the whole peptide (wrapping around
    the end). An empty peptide yields ``[0]``.

    Raises:
        KeyError: If the peptide contains an unknown amino acid letter.
    """
    if not peptide:
        return [0]

    masses = [AMINO_ACID_MASSES[amino_acid] for amino_acid in peptide]
    size = len(masses)

    # Prefix sums over the doubled peptide give wrap-around subpeptide
    # masses as simple differences.
    prefix_masses = [0]
    for mass in masses + masses:
        prefix_masses.append(prefix_masses[-1] + mass)

    spectrum: List[int] = [0, prefix_masses[size]]
    for subpeptide_length in range(1, size):
        for start_index in range(size):
            spectrum.append(prefix_masses[start_index + subpeptide_length] - prefix_masses[start_index])
    return sorted(spectrum)
# Sample dataset: a single cyclic peptide.
sample_input: str = """
LEQN
"""
input_peptide: str = sample_input.strip()

result: List[int] = calculate_cyclospectrum(input_peptide)
print(" ".join(map(str, result)))
38 Compute the Number of Peptides of Given Total Mass
Counting Peptides with Given Mass Problem. Compute the number of peptides of given total mass.
Given: An integer m.
Return: The number of linear peptides having integer mass m.
38.1 Sample Dataset
1024
38.2 Sample Output
14712706211
38.3 Solution
from collections import defaultdict
from typing import List, Dict
# The 18 distinct integer masses of the standard amino acids.
AMINO_ACID_MASSES: List[int] = [57, 71, 87, 97, 99, 101, 103, 113, 114, 115, 128, 129, 131, 137, 147, 156, 163, 186]
LIGHTEST_AMINO_ACID: int = min(AMINO_ACID_MASSES)


def count_possible_peptides(target_mass: int) -> int:
    """Count the linear peptides (as mass sequences) with the given total mass.

    Peptides are ordered sequences over ``AMINO_ACID_MASSES``; the count is
    built bottom-up, extending every lighter peptide by one amino acid.

    Args:
        target_mass: Total integer mass.

    Returns:
        Number of peptides; 0 when ``target_mass`` is below the lightest
        amino acid mass.
    """
    if target_mass < LIGHTEST_AMINO_ACID:
        return 0

    # peptide_count[m] = number of peptides of mass m; index 0 stands for the
    # empty peptide that every single-residue peptide extends.
    peptide_count: List[int] = [0] * (target_mass + 1)
    peptide_count[0] = 1
    for current_mass in range(LIGHTEST_AMINO_ACID, target_mass + 1):
        peptide_count[current_mass] = sum(
            peptide_count[current_mass - amino_acid_mass]
            for amino_acid_mass in AMINO_ACID_MASSES
            if amino_acid_mass <= current_mass
        )
    return peptide_count[target_mass]
# Sample dataset: the target mass.
sample_input: str = """
1024
"""
target_peptide_mass: int = int(sample_input.strip())
result: int = count_possible_peptides(target_peptide_mass)
print(result)
39 Find a Cyclic Peptide with Theoretical Spectrum Matching an Ideal Spectrum
Cyclopeptide Sequencing Problem. Given an ideal experimental spectrum, find a cyclic peptide whose theoretical spectrum matches the experimental spectrum.
Given: A collection of (possibly repeated) integers Spectrum corresponding to an ideal experimental spectrum.
Return: Every amino acid string Peptide such that Cyclospectrum(Peptide) = Spectrum (if such a string exists).
39.1 Sample Dataset
0 113 128 186 241 299 314 427
39.2 Sample Output
113-128-186 113-186-128 186-128-113 128-186-113 186-113-128 128-113-186
39.3 Solution
from typing import List, Set
# The 18 distinct integer masses of the standard amino acids, ascending.
AMINO_ACID_MASSES: List[int] = [57, 71, 87, 97, 99, 101, 103, 113, 114, 115, 128, 129, 131, 137, 147, 156, 163, 186]
def calculate_cyclospectrum(peptide: List[int]) -> List[int]:
    """Return the sorted theoretical spectrum of a cyclic peptide of masses.

    Contains 0, the total mass, and the mass of every proper contiguous
    subpeptide of the cycle. An empty peptide yields ``[0]``.
    """
    if not peptide:
        return [0]

    size = len(peptide)
    # Prefix sums over the doubled peptide cover wrap-around subpeptides.
    prefix_mass = [0]
    for mass in peptide + peptide:
        prefix_mass.append(prefix_mass[-1] + mass)

    spectrum: List[int] = [0, prefix_mass[size]]
    for k in range(1, size):
        for i in range(size):
            spectrum.append(prefix_mass[i + k] - prefix_mass[i])
    spectrum.sort()
    return spectrum
def calculate_linear_spectrum(peptide: List[int]) -> List[int]:
    """Return the sorted theoretical spectrum of a linear peptide of masses.

    Contains 0 and the mass of every contiguous subpeptide (no wrap-around).
    """
    prefix_mass: List[int] = [0]
    for mass in peptide:
        prefix_mass.append(prefix_mass[-1] + mass)

    linear_spectrum: List[int] = [0]
    for i in range(len(peptide)):
        for j in range(i + 1, len(peptide) + 1):
            linear_spectrum.append(prefix_mass[j] - prefix_mass[i])
    linear_spectrum.sort()
    return linear_spectrum
def expand_peptides(peptides: List[List[int]]) -> List[List[int]]:
    """Extend every peptide by each amino acid mass in ``AMINO_ACID_MASSES``.

    Returns new lists; the input peptides are not modified.
    """
    return [peptide + [mass] for peptide in peptides for mass in AMINO_ACID_MASSES]
def is_consistent(peptide: List[int], spectrum: List[int]) -> bool:
    """Tell whether a linear candidate can still grow into a solution.

    A candidate is rejected when it is too heavy to be extended to the parent
    mass, or when some mass of its linear spectrum occurs more often than in
    ``spectrum``.

    Args:
        peptide: Candidate as a list of amino acid masses.
        spectrum: Experimental spectrum, sorted ascending so that its last
            element is the parent mass.
    """
    from collections import Counter

    # Even the lightest amino acid would overshoot the parent mass.
    if sum(peptide) > spectrum[-1] - min(AMINO_ACID_MASSES):
        return False

    available = Counter(spectrum)
    needed = Counter(calculate_linear_spectrum(peptide))
    return all(count <= available[mass] for mass, count in needed.items())
def cyclopeptide_sequencing(spectrum: List[int]) -> Set[str]:
    """Find all cyclic peptides whose theoretical spectrum equals ``spectrum``.

    Branch and bound: candidates are grown one amino acid at a time; those
    reaching the parent mass are checked against the spectrum, the others are
    kept only while they remain consistent with it.

    Args:
        spectrum: Ideal experimental spectrum (any order).

    Returns:
        Set of peptides written as dash-separated masses, e.g. "113-128-186".
    """
    result: Set[str] = set()
    if not spectrum:
        return result

    ordered_spectrum = sorted(spectrum)
    parent_mass = ordered_spectrum[-1]
    candidate_peptides: List[List[int]] = [[]]

    while candidate_peptides:
        survivors: List[List[int]] = []
        for peptide in expand_peptides(candidate_peptides):
            if sum(peptide) == parent_mass:
                # Full-mass candidates are never extended further.
                if calculate_cyclospectrum(peptide) == ordered_spectrum:
                    result.add("-".join(map(str, peptide)))
            elif is_consistent(peptide, ordered_spectrum):
                survivors.append(peptide)
        candidate_peptides = survivors

    return result
# Sample dataset: the ideal spectrum as space-separated integers.
sample_input: str = """
0 113 128 186 241 299 314 427
"""
input_spectrum: List[int] = [int(x) for x in sample_input.strip().split()]

result: Set[str] = cyclopeptide_sequencing(input_spectrum)
print(" ".join(result))
40 Compute the Score of a Cyclic Peptide Against a Spectrum
Cyclic Peptide Scoring Problem. Compute the score of a cyclic peptide against a spectrum.
Given: An amino acid string Peptide and a collection of integers Spectrum.
Return: The score of Peptide against Spectrum, Score(Peptide, Spectrum).
40.1 Sample Dataset
NQEL
0 99 113 114 128 227 257 299 355 356 370 371 484
40.2 Sample Output
11
40.3 Solution
from typing import Dict, List
# Integer masses of the 20 standard amino acids, keyed by one-letter code.
AMINO_ACID_MASSES: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163
}


def calculate_cyclospectrum(peptide: str) -> List[int]:
    """Return the sorted theoretical spectrum of a cyclic peptide.

    Contains 0, the total mass, and the mass of every proper contiguous
    subpeptide of the cycle. An empty peptide yields ``[0]``.

    Raises:
        KeyError: If the peptide contains an unknown amino acid letter.
    """
    if not peptide:
        return [0]

    residue_masses = [AMINO_ACID_MASSES[aa] for aa in peptide]
    size = len(residue_masses)

    # Running totals over the doubled peptide handle the wrap-around.
    running = [0]
    for mass in residue_masses * 2:
        running.append(running[-1] + mass)

    spectrum: List[int] = [0, running[size]]
    spectrum.extend(
        running[start + length] - running[start]
        for length in range(1, size)
        for start in range(size)
    )
    spectrum.sort()
    return spectrum
def calculate_score(peptide: str, experimental_spectrum: List[int]) -> int:
    """Score a cyclic peptide against an experimental spectrum.

    The score is the number of masses shared between the peptide's
    theoretical cyclospectrum and ``experimental_spectrum``, counting
    repeated masses with their multiplicity.
    """
    from collections import Counter

    theoretical = Counter(calculate_cyclospectrum(peptide))
    experimental = Counter(experimental_spectrum)
    # Counter intersection keeps, per mass, the smaller of the two counts.
    return sum((theoretical & experimental).values())
# Sample dataset: the peptide, then the experimental spectrum.
sample_input: str = """
NQEL
0 99 113 114 128 227 257 299 355 356 370 371 484
"""
input_lines: List[str] = sample_input.strip().split("\n")
input_peptide: str = input_lines[0]
input_spectrum: List[int] = [int(x) for x in input_lines[1].split()]

result: int = calculate_score(input_peptide, input_spectrum)
print(result)
41 Implement LeaderboardCyclopeptideSequencing
Implement LeaderboardCyclopeptideSequencing
Given: An integer N and a collection of integers Spectrum.
Return: LeaderPeptide after running LeaderboardCyclopeptideSequencing(Spectrum, N).
41.1 Sample Dataset
10
0 71 113 129 147 200 218 260 313 331 347 389 460
41.2 Sample Output
113-147-71-129
41.3 Solution
from typing import List, Set, Dict
# The 18 distinct integer masses of the standard amino acids, ascending.
AMINO_ACID_MASSES: List[int] = [57, 71, 87, 97, 99, 101, 103, 113, 114, 115, 128, 129, 131, 137, 147, 156, 163, 186]
def expand_peptides(peptides: List[List[int]]) -> List[List[int]]:
    """Return every peptide extended by each mass in ``AMINO_ACID_MASSES``.

    The input peptides are left untouched; each result is a new list.
    """
    expanded_peptides: List[List[int]] = []
    for peptide in peptides:
        expanded_peptides.extend(peptide + [mass] for mass in AMINO_ACID_MASSES)
    return expanded_peptides
def calculate_cyclospectrum(peptide: List[int]) -> List[int]:
    """Return the sorted theoretical spectrum of a cyclic peptide of masses.

    Contains 0, the total mass, and the mass of every proper contiguous
    subpeptide of the cycle. An empty peptide yields ``[0]``.
    """
    if not peptide:
        return [0]

    size = len(peptide)
    extended_peptide: List[int] = peptide + peptide
    spectrum: List[int] = [0, sum(peptide)]
    for start in range(size):
        # Grow the subpeptide starting at ``start`` one residue at a time.
        subpeptide_mass = 0
        for length in range(1, size):
            subpeptide_mass += extended_peptide[start + length - 1]
            spectrum.append(subpeptide_mass)
    spectrum.sort()
    return spectrum
def is_consistent(peptide: List[int], spectrum: List[int]) -> bool:
    """Tell whether a partial (linear) peptide is consistent with a spectrum.

    Every mass of the peptide's *linear* spectrum must occur in ``spectrum``
    at least as often. The linear spectrum is used because a partial peptide
    is not yet closed into a cycle: its wrap-around masses need not appear in
    the spectrum of the final cyclic peptide.
    """
    from collections import Counter

    prefix_mass = [0]
    for mass in peptide:
        prefix_mass.append(prefix_mass[-1] + mass)

    needed = Counter([0])
    for i in range(len(peptide)):
        for j in range(i + 1, len(peptide) + 1):
            needed[prefix_mass[j] - prefix_mass[i]] += 1

    available = Counter(spectrum)
    return all(count <= available[mass] for mass, count in needed.items())
def cyclopeptide_sequencing(spectrum: List[int]) -> Set[str]:
    """Find all cyclic peptides whose theoretical spectrum equals ``spectrum``.

    Candidates grow by one amino acid per round. A candidate reaching the
    parent mass is compared against the spectrum and then dropped; any other
    candidate survives only while ``is_consistent`` accepts it.

    Args:
        spectrum: Ideal experimental spectrum (any order).

    Returns:
        Set of peptides written as dash-separated masses.
    """
    result: Set[str] = set()
    if not spectrum:
        return result

    ordered_spectrum = sorted(spectrum)
    parent_mass = ordered_spectrum[-1]
    candidate_peptides: List[List[int]] = [[]]
    while candidate_peptides:
        kept: List[List[int]] = []
        for peptide in expand_peptides(candidate_peptides):
            if sum(peptide) == parent_mass:
                if calculate_cyclospectrum(peptide) == ordered_spectrum:
                    result.add("-".join(map(str, peptide)))
            elif is_consistent(peptide, ordered_spectrum):
                kept.append(peptide)
        candidate_peptides = kept
    return result
def calculate_score(peptide: List[int], spectrum: List[int]) -> int:
    """Score a cyclic peptide (list of masses) against a spectrum.

    The score is the size of the multiset intersection between the peptide's
    theoretical cyclospectrum and ``spectrum``.
    """
    from collections import Counter

    shared = Counter(calculate_cyclospectrum(peptide)) & Counter(spectrum)
    return sum(shared.values())
def trim_leaderboard(leaderboard: List[List[int]], spectrum: List[int], n: int) -> List[List[int]]:
    """Keep the ``n`` best-scoring peptides of a leaderboard, including ties.

    Args:
        leaderboard: Candidate peptides as lists of masses.
        spectrum: Experimental spectrum the candidates are scored against.
        n: Number of places to keep; peptides tied with the n-th score are
            kept as well.

    Returns:
        The retained peptides in their original order. The leaderboard itself
        is returned when it has at most ``n`` entries; an empty list when
        ``n`` is not positive.
    """
    if n <= 0:
        return []
    if len(leaderboard) <= n:
        return leaderboard

    scores: List[int] = [calculate_score(peptide, spectrum) for peptide in leaderboard]
    threshold: int = sorted(scores, reverse=True)[n - 1]
    return [peptide for peptide, score in zip(leaderboard, scores) if score >= threshold]
def leaderboard_cyclopeptide_sequencing(spectrum: List[int], n: int) -> List[int]:
    """Run LeaderboardCyclopeptideSequencing and return the leader peptide.

    Each round extends all leaderboard peptides by one amino acid, discards
    those heavier than the parent mass, updates the leader among those that
    hit the parent mass exactly, and trims the board to the top ``n`` (with
    ties).

    Args:
        spectrum: Experimental spectrum; its largest value is the parent mass.
        n: Leaderboard size.

    Returns:
        The best-scoring peptide of parent mass found, as a list of masses
        (empty when none was found or the spectrum is empty).
    """
    if not spectrum:
        return []

    parent_mass = max(spectrum)
    leaderboard: List[List[int]] = [[]]
    leader_peptide: List[int] = []
    # Cache the leader's score instead of rescoring it for every candidate.
    leader_score = calculate_score(leader_peptide, spectrum)

    while leaderboard:
        survivors: List[List[int]] = []
        for peptide in expand_peptides(leaderboard):
            mass = sum(peptide)
            if mass > parent_mass:
                continue
            if mass == parent_mass:
                score = calculate_score(peptide, spectrum)
                if score > leader_score:
                    leader_peptide, leader_score = peptide, score
            survivors.append(peptide)
        leaderboard = trim_leaderboard(survivors, spectrum, n)
    return leader_peptide
# Sample dataset: the leaderboard size N, then the experimental spectrum.
sample_input: str = """
10
0 71 113 129 147 200 218 260 313 331 347 389 460
"""
input_lines: List[str] = sample_input.strip().split("\n")
n: int = int(input_lines[0])
spectrum: List[int] = [int(x) for x in input_lines[1].split()]

result: List[int] = leaderboard_cyclopeptide_sequencing(spectrum, n)
print("-".join(map(str, result)))
42 Generate the Convolution of a Spectrum
Spectral Convolution Problem. Compute the convolution of a spectrum.
Given: A collection of integers Spectrum.
Return: The list of elements in the convolution of Spectrum in decreasing order of their multiplicities. If an element has multiplicity k, it should appear exactly k times.
42.1 Sample Dataset
0 137 186 323
42.2 Sample Output
137 137 186 186 323 49
42.3 Solution
from typing import List, Dict
def calculate_spectrum_convolution(spectrum: List[int]) -> List[int]:
    """Return the convolution of a spectrum, most frequent differences first.

    The convolution is the multiset of positive pairwise differences between
    spectrum masses. Each difference appears as many times as it occurs, and
    differences are grouped in decreasing order of multiplicity.

    Args:
        spectrum: Masses in any order; the list is not modified.
    """
    from collections import Counter

    ordered = sorted(spectrum)
    differences = Counter(
        ordered[j] - ordered[i]
        for i in range(len(ordered))
        for j in range(i + 1, len(ordered))
        if ordered[j] != ordered[i]
    )

    result: List[int] = []
    # most_common() sorts by count, keeping first-seen order among ties.
    for mass, multiplicity in differences.most_common():
        result.extend([mass] * multiplicity)
    return result
# Sample dataset: the spectrum as space-separated integers.
sample_input: str = """
0 137 186 323
"""
input_spectrum: List[int] = [int(x) for x in sample_input.strip().split()]

convolution_result: List[int] = calculate_spectrum_convolution(input_spectrum)
print(" ".join(map(str, convolution_result)))
43 Implement ConvolutionCyclopeptideSequencing
Implement ConvolutionCyclopeptideSequencing.
Given: An integer M, an integer N, and a collection of (possibly repeated) integers Spectrum.
Return: A cyclic peptide LeaderPeptide with amino acids taken only from the top M elements (and ties) of the convolution of Spectrum that fall between 57 and 200, and where the size of Leaderboard is restricted to the top N (and ties).
43.1 Sample Dataset
20
60
57 57 71 99 129 137 170 186 194 208 228 265 285 299 307 323 356 364 394 422 493
43.2 Sample Output
99-71-137-57-72-57
43.3 Solution
from typing import List, Dict, Tuple
def calculate_spectrum_convolution(spectrum: List[int]) -> List[int]:
    """Return the convolution of a spectrum, most frequent differences first.

    The convolution is the multiset of positive pairwise differences between
    spectrum masses; each difference is repeated according to its
    multiplicity, in decreasing order of multiplicity.

    NOTE: ``spectrum`` is sorted in place. This side effect is kept for
    backward compatibility with callers that read ``spectrum[-1]`` as the
    parent mass after calling this function.
    """
    from collections import Counter

    spectrum.sort()
    differences = Counter(
        spectrum[j] - spectrum[i]
        for i in range(len(spectrum))
        for j in range(i + 1, len(spectrum))
        if spectrum[j] != spectrum[i]
    )

    result: List[int] = []
    for mass, multiplicity in differences.most_common():
        result += [mass] * multiplicity
    return result
def trim_leaderboard(leaderboard: List[List[int]], spectrum: List[int], n: int) -> List[List[int]]:
    """Keep the ``n`` best-scoring peptides of a leaderboard, including ties.

    Peptides are scored with ``calculate_score`` against ``spectrum``. The
    leaderboard itself is returned when it has at most ``n`` entries; an
    empty list is returned when ``n`` is not positive. The relative order of
    the retained peptides is preserved.
    """
    if n <= 0:
        return []
    if len(leaderboard) <= n:
        return leaderboard

    scored = [(calculate_score(peptide, spectrum), peptide) for peptide in leaderboard]
    # Score of the n-th best entry; everything tied with it is kept too.
    threshold: int = sorted((score for score, _ in scored), reverse=True)[n - 1]
    return [peptide for score, peptide in scored if score >= threshold]
def calculate_cyclospectrum(peptide: List[int]) -> List[int]:
    """Return the sorted theoretical spectrum of a cyclic peptide of masses.

    Contains 0, the total mass, and the mass of every proper contiguous
    subpeptide of the cycle. An empty peptide yields ``[0]``.
    """
    if not peptide:
        return [0]

    size = len(peptide)
    extended_peptide: List[int] = peptide + peptide
    spectrum: List[int] = [0, sum(peptide)]
    for k in range(1, size):
        spectrum.extend(sum(extended_peptide[i:i + k]) for i in range(size))
    spectrum.sort()
    return spectrum
def calculate_score(peptide: List[int], spectrum: List[int]) -> int:
    """Score a cyclic peptide (list of masses) against a spectrum.

    Counts the masses shared between the peptide's theoretical cyclospectrum
    and ``spectrum``, respecting multiplicities.
    """
    from collections import Counter

    theoretical = Counter(calculate_cyclospectrum(peptide))
    experimental = Counter(spectrum)
    return sum(min(count, experimental[mass]) for mass, count in theoretical.items())
def find_top_masses(spectrum: List[int], m: int) -> List[int]:
    """Pick the ``m`` most frequent convolution masses between 57 and 200.

    Masses tied with the m-th most frequent one are included as well.

    Args:
        spectrum: Experimental spectrum (passed on to
            ``calculate_spectrum_convolution``).
        m: Number of top masses requested.

    Returns:
        The selected masses in ascending order. All candidate masses are
        returned when fewer than ``m`` exist; an empty list when there are
        none or ``m`` is not positive.
    """
    from collections import Counter

    convolution: List[int] = calculate_spectrum_convolution(spectrum)
    frequency = Counter(mass for mass in convolution if 57 <= mass <= 200)
    if m <= 0 or not frequency:
        return []

    ranked: List[Tuple[int, int]] = frequency.most_common()
    # Clamp so that asking for more masses than exist selects all of them.
    threshold = ranked[min(m, len(ranked)) - 1][1]
    return sorted(mass for mass, count in ranked if count >= threshold)
def expand_peptides(peptides: List[List[int]], masses: List[int]) -> List[List[int]]:
    """Return every peptide extended by each of the given masses.

    Results are grouped by source peptide, in the order of ``masses``; the
    input peptides are not modified.
    """
    return [peptide + [mass] for peptide in peptides for mass in masses]
def convolution_cyclopeptide_sequencing(spectrum: List[int], m: int, n: int) -> List[int]:
    """Run ConvolutionCyclopeptideSequencing and return the leader peptide.

    The amino acid alphabet is taken from the top ``m`` masses (with ties) of
    the spectrum's convolution; the search itself is a leaderboard search
    restricted to the top ``n`` peptides (with ties) per round.

    Args:
        spectrum: Experimental spectrum; its largest value is the parent mass.
        m: Number of convolution masses to use as the alphabet.
        n: Leaderboard size.

    Returns:
        The best-scoring peptide of parent mass found, as a list of masses
        (empty when none was found or the spectrum is empty).
    """
    if not spectrum:
        return []

    masses: List[int] = find_top_masses(spectrum, m)
    parent_mass = max(spectrum)
    leaderboard: List[List[int]] = [[]]
    leader_peptide: List[int] = []
    # Cache the leader's score instead of rescoring it for every candidate.
    leader_score = calculate_score(leader_peptide, spectrum)

    while leaderboard:
        survivors: List[List[int]] = []
        for peptide in expand_peptides(leaderboard, masses):
            mass = sum(peptide)
            if mass > parent_mass:
                continue
            if mass == parent_mass:
                score = calculate_score(peptide, spectrum)
                if score > leader_score:
                    leader_peptide, leader_score = peptide, score
            survivors.append(peptide)
        leaderboard = trim_leaderboard(survivors, spectrum, n)
    return leader_peptide
# Sample dataset: M, then N, then the experimental spectrum.
sample_input: str = """
20
60
57 57 71 99 129 137 170 186 194 208 228 265 285 299 307 323 356 364 394 422 493
"""
input_lines: List[str] = sample_input.strip().split("\n")
m: int = int(input_lines[0])
n: int = int(input_lines[1])
spectrum: List[int] = [int(x) for x in input_lines[2].split()]

result: List[int] = convolution_cyclopeptide_sequencing(spectrum, m, n)
print("-".join(map(str, result)))
44 Generate the Theoretical Spectrum of a Linear Peptide
Linear Spectrum Problem. Generate the ideal linear spectrum of a peptide.
Given: An amino acid string Peptide.
Return: The linear spectrum of Peptide.
44.1 Sample Dataset
NQEL
44.2 Sample Output
0 113 114 128 129 242 242 257 370 371 484
44.3 Solution
from typing import List, Dict
# Integer masses of the 20 standard amino acids, keyed by one-letter code.
AMINO_ACID_MASSES: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137,
    'K': 128, 'M': 131, 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156,
    'T': 101, 'W': 186, 'V': 99, 'Y': 163
}


def calculate_linear_spectrum(peptide: str) -> List[int]:
    """Return the sorted theoretical spectrum of a linear peptide.

    Contains 0 and the mass of every contiguous subpeptide.

    Raises:
        KeyError: If the peptide contains an unknown amino acid letter.
    """
    # prefix_masses[i] is the mass of the first i residues.
    prefix_masses: List[int] = [0]
    for amino_acid in peptide:
        prefix_masses.append(prefix_masses[-1] + AMINO_ACID_MASSES[amino_acid])

    linear_spectrum: List[int] = [0]
    for i in range(len(peptide)):
        for j in range(i + 1, len(peptide) + 1):
            linear_spectrum.append(prefix_masses[j] - prefix_masses[i])
    return sorted(linear_spectrum)
# Sample input
sample_peptide: str = "NQEL"

# Calculate and print the linear spectrum
result: List[int] = calculate_linear_spectrum(sample_peptide)
print(" ".join(map(str, result)))
45 Compute the Score of a Linear Peptide
Linear Peptide Scoring Problem. Compute the score of a linear peptide with respect to a spectrum.
Given: An amino acid string Peptide and a collection of integers LinearSpectrum.
Return: The linear score of Peptide against Spectrum, LinearScore(Peptide, Spectrum).
45.1 Sample Dataset
NQEL
0 99 113 114 128 227 257 299 355 356 370 371 484
45.2 Sample Output
8
45.3 Solution
from typing import List, Dict
# Integer masses of the 20 standard amino acids, keyed by one-letter code.
AMINO_ACID_MASSES: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137,
    'K': 128, 'M': 131, 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156,
    'T': 101, 'W': 186, 'V': 99, 'Y': 163
}


def calculate_linear_spectrum(peptide: str) -> List[int]:
    """Return the sorted theoretical spectrum of a linear peptide.

    Contains 0 and the mass of every contiguous subpeptide.

    Raises:
        KeyError: If the peptide contains an unknown amino acid letter.
    """
    prefix_masses: List[int] = [0]
    for amino_acid in peptide:
        current_mass = prefix_masses[-1] + AMINO_ACID_MASSES[amino_acid]
        prefix_masses.append(current_mass)

    # Every subpeptide mass is a difference of two prefix masses.
    linear_spectrum: List[int] = [0] + [
        prefix_masses[j] - prefix_masses[i]
        for i in range(len(peptide))
        for j in range(i + 1, len(peptide) + 1)
    ]
    return sorted(linear_spectrum)
def calculate_linear_score(peptide: str, experimental_spectrum: List[int]) -> int:
    """Score a linear peptide against an experimental spectrum.

    The score is the number of masses shared between the peptide's
    theoretical linear spectrum and ``experimental_spectrum``, counting
    repeated masses with their multiplicity.
    """
    from collections import Counter

    theoretical = Counter(calculate_linear_spectrum(peptide))
    experimental = Counter(experimental_spectrum)
    # Counter intersection keeps, per mass, the smaller of the two counts.
    return sum((theoretical & experimental).values())
# Sample input
sample_input: str = """
NQEL
0 99 113 114 128 227 257 299 355 356 370 371 484
"""
input_lines: List[str] = sample_input.strip().split("\n")
input_peptide: str = input_lines[0]
input_spectrum: List[int] = [int(x) for x in input_lines[1].split()]

# Calculate and print the linear score
result: int = calculate_linear_score(input_peptide, input_spectrum)
print(result)
46 Trim a Peptide Leaderboard
Trim Problem. Trim a leaderboard of peptides.
Given: A leaderboard of linear peptides Leaderboard, a linear spectrum Spectrum, and an integer N.
Return: The top N peptides from Leaderboard scored against Spectrum. Remember to use LinearScore.
46.1 Sample Dataset
LAST ALST TLLT TQAS
0 71 87 101 113 158 184 188 259 271 372
2
46.2 Sample Output
LAST ALST
46.3 Solution
from typing import List, Dict
# Integer masses of the 20 standard amino acids, keyed by one-letter code.
AMINO_ACID_MASSES: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137,
    'K': 128, 'M': 131, 'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156,
    'T': 101, 'W': 186, 'V': 99, 'Y': 163
}


def calculate_linear_spectrum(peptide: str) -> List[int]:
    """Return the sorted theoretical spectrum of a linear peptide.

    Contains 0 and the mass of every contiguous subpeptide.

    Raises:
        KeyError: If the peptide contains an unknown amino acid letter.
    """
    running_total = 0
    prefix_masses: List[int] = [running_total]
    for amino_acid in peptide:
        running_total += AMINO_ACID_MASSES[amino_acid]
        prefix_masses.append(running_total)

    linear_spectrum: List[int] = [0]
    for end in range(1, len(prefix_masses)):
        for start in range(end):
            linear_spectrum.append(prefix_masses[end] - prefix_masses[start])
    return sorted(linear_spectrum)
def calculate_linear_score(peptide: str, experimental_spectrum: List[int]) -> int:
    """Score a linear peptide against an experimental spectrum.

    Counts the masses shared between the peptide's theoretical linear
    spectrum and ``experimental_spectrum``, respecting multiplicities.
    """
    from collections import Counter

    theoretical = Counter(calculate_linear_spectrum(peptide))
    experimental = Counter(experimental_spectrum)
    return sum(min(count, experimental[mass]) for mass, count in theoretical.items())
def trim_leaderboard(leaderboard: List[str], spectrum: List[int], n: int) -> List[str]:
    """Keep the ``n`` best linear-scoring peptides of a leaderboard, with ties.

    Args:
        leaderboard: Candidate peptides as amino acid strings.
        spectrum: Spectrum the candidates are scored against with
            ``calculate_linear_score``.
        n: Number of places to keep; peptides tied with the n-th score are
            kept as well.

    Returns:
        The retained peptides in their original order. The leaderboard itself
        is returned when it has at most ``n`` entries; an empty list when
        ``n`` is not positive.
    """
    if n <= 0:
        return []
    if len(leaderboard) <= n:
        return leaderboard

    peptide_scores: List[int] = [calculate_linear_score(peptide, spectrum) for peptide in leaderboard]
    score_threshold: int = sorted(peptide_scores, reverse=True)[n - 1]
    return [peptide for peptide, score in zip(leaderboard, peptide_scores) if score >= score_threshold]
# Sample input
sample_input: str = """
LAST ALST TLLT TQAS
0 71 87 101 113 158 184 188 259 271 372
2
"""
input_lines: List[str] = sample_input.strip().split("\n")
input_leaderboard: List[str] = input_lines[0].split()
input_spectrum: List[int] = [int(x) for x in input_lines[1].split()]
input_n: int = int(input_lines[2])

result: List[str] = trim_leaderboard(input_leaderboard, input_spectrum, input_n)
print(" ".join(result))
47 Solve the Turnpike Problem
Turnpike Problem. Given all pairwise distances between points on a line segment, reconstruct the positions of those points.
Given: A collection of integers L.
Return: A set A such that \(\Delta A = L\).
47.1 Sample Dataset
-10 -8 -7 -6 -5 -4 -3 -3 -2 -2 0 0 0 0 0 2 2 3 3 4 5 6 7 8 10
47.2 Sample Output
0 2 4 7 10
47.3 Solution
from typing import List, Set, Optional
def calculate_absolute_differences(set_a: Set[int], set_b: Set[int]) -> List[int]:
    """Return ``|a - b|`` for every pair with ``a`` in ``set_a`` and ``b`` in ``set_b``.

    One value is produced per pair, so the result may contain repeats; its
    order follows set iteration order and should not be relied upon.
    """
    return [abs(a - b) for a in set_a for b in set_b]
def is_multiset_subset(subset: List[int], superset: List[int]) -> bool:
    """Tell whether ``subset`` is contained in ``superset`` as a multiset.

    Every value must occur in ``superset`` at least as many times as it
    occurs in ``subset``.
    """
    from collections import Counter

    available = Counter(superset)
    return all(count <= available[elem] for elem, count in Counter(subset).items())
def multiset_difference(set_a: List[int], set_b: List[int]) -> List[int]:
    """Return the multiset difference ``set_a - set_b`` as a sorted list.

    Each value is kept as many times as it occurs in ``set_a`` beyond its
    occurrences in ``set_b``; values that only occur in ``set_b`` are ignored.
    """
    from collections import Counter

    # Counter subtraction drops entries whose count would be zero or negative.
    return sorted((Counter(set_a) - Counter(set_b)).elements())
def place_elements(distances: List[int]) -> Optional[Set[int]]:
    """Recursively place points so that their pairwise distances use up ``distances``.

    Relies on two module-level names: ``placed_elements`` (the set of points
    placed so far, modified in place during the search) and ``total_width``
    (the largest distance, i.e. the position of the right-most point).

    Args:
        distances: Sorted positive distances that are still unexplained.

    Returns:
        ``placed_elements`` once every distance is accounted for, or ``None``
        when the current partial placement cannot be completed.
    """
    if not distances:
        return placed_elements

    current_distance: int = distances[-1]

    # The largest remaining distance must be measured from one of the two end
    # points, so the new point sits either at that distance from the left end
    # or at that distance from the right end.
    for position in (current_distance, total_width - current_distance):
        differences: List[int] = calculate_absolute_differences({position}, placed_elements)
        if not is_multiset_subset(differences, distances):
            continue

        placed_elements.add(position)
        solution: Optional[Set[int]] = place_elements(multiset_difference(distances, differences))
        if solution is not None:
            return solution
        # Dead end: undo the placement before trying the other side.
        placed_elements.remove(position)

    return None
# Sample input
sample_input: str = """
-10 -8 -7 -6 -5 -4 -3 -3 -2 -2 0 0 0 0 0 2 2 3 3 4 5 6 7 8 10
"""
input_distances: List[int] = [int(x) for x in sample_input.strip().split()]
# Only the positive half of the symmetric difference multiset is needed.
positive_distances: List[int] = [x for x in input_distances if x > 0]

# The largest distance fixes the two end points of the segment.
total_width: int = positive_distances.pop(-1)
placed_elements: Set[int] = {0, total_width}

result: Optional[Set[int]] = place_elements(positive_distances)
print(" ".join(map(str, sorted(result))))
48 Find the Minimum Number of Coins Needed to Make Change
The Change Problem. Find the minimum number of coins needed to make change.
Given: An integer money and an array Coins of positive integers.
Return: The minimum number of coins with denominations Coins that changes money.
48.1 Sample Dataset
40
1,5,10,20,25,50
48.2 Sample Output
2
48.3 Solution
from typing import List
def min_coins_for_change(target_amount: int, available_coins: List[int]) -> int:
    """Return the minimum number of coins needed to change ``target_amount``.

    Args:
        target_amount: Amount of money to change.
        available_coins: Coin denominations; each may be used any number of
            times. Non-positive denominations are ignored.

    Returns:
        The minimum number of coins; 0 when ``target_amount`` is not
        positive. When the amount cannot be changed with the given coins the
        sentinel ``target_amount + 1`` is returned.
    """
    if target_amount <= 0:
        return 0

    usable_coins = [coin for coin in available_coins if coin > 0]
    unreachable = target_amount + 1  # more coins than any real solution uses

    # min_coins_needed[a] = fewest coins that sum to a.
    min_coins_needed = [0] + [unreachable] * target_amount
    for current_amount in range(1, target_amount + 1):
        for coin in usable_coins:
            if coin <= current_amount:
                coins_for_current = min_coins_needed[current_amount - coin] + 1
                if coins_for_current < min_coins_needed[current_amount]:
                    min_coins_needed[current_amount] = coins_for_current
    return min_coins_needed[target_amount]
# Sample dataset: the amount, then the comma-separated coin denominations.
sample_input = """
40
1,5,10,20,25,50
"""
input_lines = sample_input.strip().split("\n")
target_amount = int(input_lines[0])
available_coins = [int(x) for x in input_lines[1].split(",")]

print(min_coins_for_change(target_amount, available_coins))
49 Find the Length of a Longest Path in a Manhattan-like Grid
Length of a Longest Path in the Manhattan Tourist Problem. Find the length of a longest path in a rectangular city.
Given: Integers n and m, followed by an n × (m+1) matrix Down and an (n+1) × m matrix Right. The two matrices are separated by the "-" symbol.
Return: The length of a longest path from source (0, 0) to sink (n, m) in the n × m rectangular grid whose edges are defined by the matrices Down and Right.
49.1 Sample Dataset
4 4
1 0 2 4 3
4 6 5 2 1
4 4 5 2 1
5 6 8 5 3
-
3 2 4 0
3 2 4 2
0 7 3 3
3 3 0 2
1 3 2 2
49.2 Sample Output
34
49.3 Solution
from typing import List, Tuple
def parse_manhattan_tourist_input(input_text: str) -> Tuple[int, int, List[List[int]], List[List[int]]]:
    """Parse a Manhattan Tourist dataset.

    Expected layout after stripping surrounding whitespace: a header line
    ``rows cols``, then ``rows`` lines of ``cols + 1`` integers (Down), one
    separator line, then ``rows + 1`` lines of ``cols`` integers (Right).

    Args:
        input_text: Raw dataset text.

    Returns:
        ``(rows, cols, down_weights, right_weights)``.

    Raises:
        ValueError: If the header or a matrix token is not an integer.
        IndexError: If a matrix row has too few values or lines are missing.
    """
    lines = input_text.strip().split('\n')
    rows, cols = map(int, lines[0].split())

    def read_row(line_index: int, width: int) -> List[int]:
        # Convert the whole line, then keep exactly `width` leading values.
        values = list(map(int, lines[line_index].split()))
        return [values[j] for j in range(width)]

    # Down matrix occupies lines 1..rows.
    down_weights = [read_row(i + 1, cols + 1) for i in range(rows)]

    # Line rows + 1 is the separator; the Right matrix follows it.
    separator_index = rows + 1
    right_weights = [read_row(i + separator_index + 1, cols) for i in range(rows + 1)]

    return rows, cols, down_weights, right_weights
def calculate_longest_manhattan_path(rows: int, cols: int, down_weights: List[List[int]], right_weights: List[List[int]]) -> int:
    """Return the length of a longest source-to-sink path in the grid.

    Args:
        rows: Number of vertical steps from source (0, 0) to sink.
        cols: Number of horizontal steps from source to sink.
        down_weights: ``rows x (cols + 1)`` matrix; ``down_weights[i][j]`` is
            the weight of the edge from ``(i, j)`` down to ``(i + 1, j)``.
        right_weights: ``(rows + 1) x cols`` matrix; ``right_weights[i][j]``
            is the weight of the edge from ``(i, j)`` right to ``(i, j + 1)``.

    Returns:
        The maximum total weight of a path from ``(0, 0)`` to ``(rows, cols)``
        that only moves down or right.
    """
    path_scores = [[0] * (cols + 1) for _ in range(rows + 1)]

    # First column: reachable only by moving down.
    for i in range(1, rows + 1):
        path_scores[i][0] = path_scores[i - 1][0] + down_weights[i - 1][0]

    # First row: reachable only by moving right.
    for j in range(1, cols + 1):
        path_scores[0][j] = path_scores[0][j - 1] + right_weights[0][j - 1]

    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            path_scores[i][j] = max(
                path_scores[i - 1][j] + down_weights[i - 1][j],
                path_scores[i][j - 1] + right_weights[i][j - 1],
            )

    return path_scores[rows][cols]
# Sample input: "rows cols", the Down matrix, a "-" separator, the Right matrix.
sample_input = """
4 4
1 0 2 4 3
4 6 5 2 1
4 4 5 2 1
5 6 8 5 3
-
3 2 4 0
3 2 4 2
0 7 3 3
3 3 0 2
1 3 2 2
"""

rows, cols, down_weights, right_weights = parse_manhattan_tourist_input(sample_input)
longest_path_score = calculate_longest_manhattan_path(rows, cols, down_weights, right_weights)
print(f"{longest_path_score}")
50 Find a Longest Common Subsequence of Two Strings
Longest Common Subsequence Problem.
Given: Two strings.
Return: A longest common subsequence of these strings.
50.1 Sample Dataset
AACCTTGG
ACACTGTGA
50.2 Sample Output
AACTGG
50.3 Solution
from typing import List, Tuple
def longest_common_subsequence(sequence1: str, sequence2: str) -> str:
    """Return one longest common subsequence of the two strings.

    Fills a ``(len(sequence1) + 1) x (len(sequence2) + 1)`` score table and a
    parallel table of back-pointers, then walks the pointers from the
    bottom-right corner.

    Args:
        sequence1: First string.
        sequence2: Second string.

    Returns:
        A longest common subsequence (empty if there is none). When several
        exist, ties prefer moving "up", then "left", then "diag".
    """
    len1, len2 = len(sequence1), len(sequence2)

    score_matrix: List[List[int]] = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    backtrack_matrix: List[List[str]] = [[''] * (len2 + 1) for _ in range(len1 + 1)]

    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            is_match = sequence1[i - 1] == sequence2[j - 1]
            match_score = score_matrix[i - 1][j - 1] + (1 if is_match else 0)
            best = max(score_matrix[i - 1][j], score_matrix[i][j - 1], match_score)
            score_matrix[i][j] = best

            # "diag" is recorded only when it strictly beats both neighbours,
            # which can only happen on a character match.
            if best == score_matrix[i - 1][j]:
                backtrack_matrix[i][j] = "up"
            elif best == score_matrix[i][j - 1]:
                backtrack_matrix[i][j] = "left"
            else:
                backtrack_matrix[i][j] = "diag"

    # Collect characters back-to-front, then reverse once at the end.
    collected: List[str] = []
    i, j = len1, len2
    while i > 0 and j > 0:
        move = backtrack_matrix[i][j]
        if move == "diag":
            collected.append(sequence1[i - 1])
            i -= 1
            j -= 1
        elif move == "left":
            j -= 1
        else:
            i -= 1

    return "".join(reversed(collected))
# Sample input: one string per line.
sample_input = """
AACCTTGG
ACACTGTGA
"""

input_lines = sample_input.strip().split("\n")
sequence1 = input_lines[0]
sequence2 = input_lines[1]

print(longest_common_subsequence(sequence1, sequence2))
51 Find the Longest Path in a DAG
Longest Path in a DAG Problem. Find a longest path between two nodes in an edge-weighted DAG.
Given: An integer representing the source node of a graph, followed by an integer representing the sink node of the graph, followed by an edge-weighted graph. The graph is represented by a modified adjacency list in which the notation "0->1:7" indicates that an edge connects node 0 to node 1 with weight 7.
Return: The length of a longest path in the graph, followed by a longest path. (If multiple longest paths exist, you may return any one.)
51.1 Sample Dataset
0
4
0->1:7
0->2:4
2->3:2
1->4:1
3->4:3
51.2 Sample Output
9
0->2->3->4
51.3 Solution
from typing import List, Tuple, Dict, Optional
class Node:
    """A vertex of an edge-weighted directed graph."""

    def __init__(self, label: str):
        self.label: str = label
        # (node, weight) pairs for edges that end at this node.
        self.parent_nodes: List[Tuple['Node', int]] = []
        # (node, weight) pairs for edges that start at this node.
        self.target_nodes: List[Tuple['Node', int]] = []
        # Depth-first-search marker used by DAG.topological_sort.
        self.visited: bool = False


class DAG:
    """Edge-weighted directed acyclic graph with longest-path search."""

    def __init__(self):
        self.nodes_dict: Dict[str, Node] = {}
        # Filled by longest_path: best known distance from the source.
        self.distances: Dict[str, float] = {}
        # Filled by longest_path: predecessor label on the best path.
        self.backtrack: Dict[str, Optional[str]] = {}

    def add_node(self, label: str) -> Node:
        """Return the node called ``label``, creating it if necessary."""
        if label in self.nodes_dict:
            return self.nodes_dict[label]
        new_node = Node(label)
        self.nodes_dict[label] = new_node
        return new_node

    def construct_dag(self, adjacency_list: List[str]) -> None:
        """Add edges described by lines of the form ``"source->target:weight"``.

        Blank lines are ignored.

        Raises:
            ValueError: If a non-blank line does not match the format or the
                weight is not an integer.
        """
        for line in adjacency_list:
            line = line.strip()
            if not line:
                continue
            source_label, temp = line.split("->")
            target_label, weight_str = temp.split(":")
            weight = int(weight_str)

            source_node = self.add_node(source_label)
            target_node = self.add_node(target_label)

            source_node.target_nodes.append((target_node, weight))
            target_node.parent_nodes.append((source_node, weight))

    def topological_sort_util(self, current_node: Node, sorted_labels: List[str]) -> None:
        """Depth-first visit that prepends ``current_node`` after its descendants."""
        current_node.visited = True
        for neighbor, _ in current_node.target_nodes:
            if not neighbor.visited:
                self.topological_sort_util(neighbor, sorted_labels)
        sorted_labels.insert(0, current_node.label)

    def topological_sort(self) -> List[str]:
        """Return all node labels in a topological order."""
        # Reset the markers so that repeated calls see a fresh graph; without
        # this a second call would return an empty ordering.
        for node in self.nodes_dict.values():
            node.visited = False

        sorted_labels: List[str] = []
        for node in self.nodes_dict.values():
            if not node.visited:
                self.topological_sort_util(node, sorted_labels)
        return sorted_labels

    def longest_path(self, source: str, sink: str) -> Tuple[float, List[str]]:
        """Return the length of a longest ``source`` -> ``sink`` path and the path.

        Args:
            source: Label of the start node.
            sink: Label of the end node.

        Returns:
            ``(length, labels)`` where ``labels`` starts with ``source`` and
            ends with ``sink``. For ``source == sink`` this is
            ``(0, [source])``.

        Raises:
            ValueError: If ``sink`` cannot be reached from ``source``.
        """
        # Drop results of any previous query before starting a new one.
        self.distances.clear()
        self.backtrack.clear()
        for label in self.nodes_dict:
            self.distances[label] = float("-inf")
        self.distances[source] = 0
        self.backtrack[source] = None

        # Relax outgoing edges in topological order; nodes not reachable from
        # the source keep a distance of -inf and never update a neighbour.
        for label in self.topological_sort():
            current_node = self.nodes_dict[label]
            for target_node, weight in current_node.target_nodes:
                candidate = self.distances[label] + weight
                if self.distances[target_node.label] < candidate:
                    self.distances[target_node.label] = candidate
                    self.backtrack[target_node.label] = label

        if self.distances.get(sink, float("-inf")) == float("-inf"):
            raise ValueError(f"no path from {source!r} to {sink!r}")

        # Walk predecessors from the sink back to the source.
        reversed_path: List[str] = [sink]
        current_label = sink
        while current_label != source:
            current_label = self.backtrack[current_label]
            reversed_path.append(current_label)
        reversed_path.reverse()
        return self.distances[sink], reversed_path
# Sample input: source label, sink label, then one "a->b:w" edge per line.
sample_input: str = """
0
4
0->1:7
0->2:4
2->3:2
1->4:1
3->4:3
"""

input_lines = sample_input.strip().split("\n")
source_label: str = input_lines[0]
sink_label: str = input_lines[1]
adjacency_list: List[str] = input_lines[2:]

graph = DAG()
graph.construct_dag(adjacency_list)

longest_distance, longest_path = graph.longest_path(source_label, sink_label)
print(longest_distance)
print("->".join(longest_path))
52 Find a Highest-Scoring Alignment of Two Strings
Global Alignment Problem. Find the highest-scoring alignment between two strings using a scoring matrix.
Given: Two amino acid strings.
Return: The maximum alignment score of these strings followed by an alignment achieving this maximum score. Use the BLOSUM62 scoring matrix and indel penalty σ = 5. (If multiple alignments achieving the maximum score exist, you may return any one.)
52.1 Sample Dataset
PLEASANTLY
MEANLY
52.2 Sample Output
8
PLEASANTLY
-MEA--N-LY
52.3 Solution
from typing import Dict, List, Optional, Tuple
# BLOSUM62 substitution scores. Each unordered residue pair is stored once,
# so a lookup may need to try both (a, b) and (b, a).
BLOSUM62: Dict[Tuple[str, str], int] = {
    ('W', 'F'): 1, ('L', 'R'): -2, ('S', 'P'): -1, ('V', 'T'): 0,
    ('Q', 'Q'): 5, ('N', 'A'): -2, ('Z', 'Y'): -2, ('W', 'R'): -3,
    ('Q', 'A'): -1, ('S', 'D'): 0, ('H', 'H'): 8, ('S', 'H'): -1,
    ('H', 'D'): -1, ('L', 'N'): -3, ('W', 'A'): -3, ('Y', 'M'): -1,
    ('G', 'R'): -2, ('Y', 'I'): -1, ('Y', 'E'): -2, ('B', 'Y'): -3,
    ('Y', 'A'): -2, ('V', 'D'): -3, ('B', 'S'): 0, ('Y', 'Y'): 7,
    ('G', 'N'): 0, ('E', 'C'): -4, ('Y', 'Q'): -1, ('Z', 'Z'): 4,
    ('V', 'A'): 0, ('C', 'C'): 9, ('M', 'R'): -1, ('V', 'E'): -2,
    ('T', 'N'): 0, ('P', 'P'): 7, ('V', 'I'): 3, ('V', 'S'): -2,
    ('Z', 'P'): -1, ('V', 'M'): 1, ('T', 'F'): -2, ('V', 'Q'): -2,
    ('K', 'K'): 5, ('P', 'D'): -1, ('I', 'H'): -3, ('I', 'D'): -3,
    ('T', 'R'): -1, ('P', 'L'): -3, ('K', 'G'): -2, ('M', 'N'): -2,
    ('P', 'H'): -2, ('F', 'Q'): -3, ('Z', 'G'): -2, ('X', 'L'): -1,
    ('T', 'M'): -1, ('Z', 'C'): -3, ('X', 'H'): -1, ('D', 'R'): -2,
    ('B', 'W'): -4, ('X', 'D'): -1, ('Z', 'K'): 1, ('F', 'A'): -2,
    ('Z', 'W'): -3, ('F', 'E'): -3, ('D', 'N'): 1, ('B', 'K'): 0,
    ('X', 'X'): -1, ('F', 'I'): 0, ('B', 'G'): -1, ('X', 'T'): 0,
    ('F', 'M'): 0, ('B', 'C'): -3, ('Z', 'I'): -3, ('Z', 'V'): -2,
    ('S', 'S'): 4, ('L', 'Q'): -2, ('W', 'E'): -3, ('Q', 'R'): 1,
    ('N', 'N'): 6, ('W', 'M'): -1, ('Q', 'C'): -3, ('W', 'I'): -3,
    ('S', 'C'): -1, ('L', 'A'): -1, ('S', 'G'): 0, ('L', 'E'): -3,
    ('W', 'Q'): -2, ('H', 'G'): -2, ('S', 'K'): 0, ('Q', 'N'): 0,
    ('N', 'R'): 0, ('H', 'C'): -3, ('Y', 'N'): -2, ('G', 'Q'): -2,
    ('Y', 'F'): 3, ('C', 'A'): 0, ('V', 'L'): 1, ('G', 'E'): -2,
    ('G', 'A'): 0, ('K', 'R'): 2, ('E', 'D'): 2, ('Y', 'R'): -2,
    ('M', 'Q'): 0, ('T', 'I'): -1, ('C', 'D'): -3, ('V', 'F'): -1,
    ('T', 'A'): 0, ('T', 'P'): -1, ('B', 'P'): -2, ('T', 'E'): -1,
    ('V', 'N'): -3, ('P', 'G'): -2, ('M', 'A'): -1, ('K', 'H'): -1,
    ('V', 'R'): -3, ('P', 'C'): -3, ('M', 'E'): -2, ('K', 'L'): -2,
    ('V', 'V'): 4, ('M', 'I'): 1, ('T', 'Q'): -1, ('I', 'G'): -4,
    ('P', 'K'): -1, ('M', 'M'): 5, ('K', 'D'): -1, ('I', 'C'): -1,
    ('Z', 'D'): 1, ('F', 'R'): -3, ('X', 'K'): -1, ('Q', 'D'): 0,
    ('X', 'G'): -1, ('Z', 'L'): -3, ('X', 'C'): -2, ('Z', 'H'): 0,
    ('B', 'L'): -4, ('B', 'H'): 0, ('F', 'F'): 6, ('X', 'W'): -2,
    ('B', 'D'): 4, ('D', 'A'): -2, ('S', 'L'): -2, ('X', 'S'): 0,
    ('F', 'N'): -3, ('S', 'R'): -1, ('W', 'D'): -4, ('V', 'Y'): -1,
    ('W', 'L'): -2, ('H', 'R'): 0, ('W', 'H'): -2, ('H', 'N'): 1,
    ('W', 'T'): -2, ('T', 'T'): 5, ('S', 'F'): -2, ('W', 'P'): -4,
    ('L', 'D'): -4, ('B', 'I'): -3, ('L', 'H'): -3, ('S', 'N'): 1,
    ('B', 'T'): -1, ('L', 'L'): 4, ('Y', 'K'): -2, ('E', 'Q'): 2,
    ('Y', 'G'): -3, ('Z', 'S'): 0, ('Y', 'C'): -2, ('G', 'D'): -1,
    ('B', 'V'): -3, ('E', 'A'): -1, ('Y', 'W'): 2, ('E', 'E'): 5,
    ('Y', 'S'): -2, ('C', 'N'): -3, ('V', 'C'): -1, ('T', 'H'): -2,
    ('P', 'R'): -2, ('V', 'G'): -3, ('T', 'L'): -1, ('V', 'K'): -2,
    ('K', 'Q'): 1, ('R', 'A'): -1, ('I', 'R'): -3, ('T', 'D'): -1,
    ('P', 'F'): -4, ('I', 'N'): -3, ('K', 'I'): -3, ('M', 'D'): -3,
    ('V', 'W'): -3, ('W', 'W'): 11, ('M', 'H'): -2, ('P', 'N'): -2,
    ('K', 'A'): -1, ('M', 'L'): 2, ('K', 'E'): 1, ('Z', 'E'): 4,
    ('X', 'N'): -1, ('Z', 'A'): -1, ('Z', 'M'): -1, ('X', 'F'): -1,
    ('K', 'C'): -3, ('B', 'Q'): 0, ('X', 'B'): -1, ('B', 'M'): -3,
    ('F', 'C'): -2, ('Z', 'Q'): 3, ('X', 'Z'): -1, ('F', 'G'): -3,
    ('B', 'E'): 1, ('X', 'V'): -1, ('F', 'K'): -3, ('B', 'A'): -2,
    ('X', 'R'): -1, ('D', 'D'): 6, ('W', 'G'): -2, ('Z', 'F'): -3,
    ('S', 'Q'): 0, ('W', 'C'): -2, ('W', 'K'): -3, ('H', 'Q'): 0,
    ('L', 'C'): -1, ('W', 'N'): -4, ('S', 'A'): 1, ('L', 'G'): -4,
    ('W', 'S'): -3, ('S', 'E'): 0, ('H', 'E'): 0, ('S', 'I'): -2,
    ('H', 'A'): -2, ('S', 'M'): -1, ('Y', 'L'): -1, ('Y', 'H'): 2,
    ('Y', 'D'): -3, ('E', 'R'): 0, ('X', 'P'): -2, ('G', 'G'): 6,
    ('G', 'C'): -3, ('E', 'N'): 0, ('Y', 'T'): -2, ('Y', 'P'): -3,
    ('T', 'K'): -1, ('A', 'A'): 4, ('P', 'Q'): -1, ('T', 'C'): -1,
    ('V', 'H'): -3, ('T', 'G'): -2, ('I', 'Q'): -3, ('Z', 'T'): -1,
    ('C', 'R'): -3, ('V', 'P'): -2, ('P', 'E'): -1, ('M', 'C'): -1,
    ('K', 'N'): 0, ('I', 'I'): 4, ('P', 'A'): -1, ('M', 'G'): -3,
    ('T', 'S'): 1, ('I', 'E'): -3, ('P', 'M'): -2, ('M', 'K'): -1,
    ('I', 'A'): -1, ('P', 'I'): -3, ('R', 'R'): 5, ('X', 'M'): -1,
    ('L', 'I'): 2, ('X', 'I'): -1, ('Z', 'B'): 1, ('X', 'E'): -1,
    ('Z', 'N'): 0, ('X', 'A'): 0, ('B', 'R'): -1, ('B', 'N'): 3,
    ('F', 'D'): -3, ('X', 'Y'): -1, ('Z', 'R'): 0, ('F', 'H'): -1,
    ('B', 'F'): -3, ('F', 'L'): 0, ('X', 'Q'): -1, ('B', 'B'): 4,
}
def global_alignment(
    sequence1: str,
    sequence2: str,
    indel_penalty: int = 5,
    scoring_matrix: Optional[Dict[Tuple[str, str], int]] = None,
) -> Tuple[int, str, str]:
    """Compute a highest-scoring global alignment of two strings.

    Args:
        sequence1: First string (shown on the first alignment row).
        sequence2: Second string (shown on the second alignment row).
        indel_penalty: Positive cost subtracted for every gap column.
        scoring_matrix: Substitution scores keyed by residue pair. A pair is
            looked up as ``(a, b)`` and, if absent, as ``(b, a)``. Defaults
            to the module-level ``BLOSUM62`` table.

    Returns:
        ``(score, aligned_sequence1, aligned_sequence2)`` where gaps are
        written as ``"-"``. Ties prefer the diagonal move, then "up" (gap in
        ``sequence2``), then "left" (gap in ``sequence1``).

    Raises:
        KeyError: If a residue pair is missing from ``scoring_matrix`` in
            both orientations.
    """
    if scoring_matrix is None:
        scoring_matrix = BLOSUM62

    row_count = len(sequence1) + 1
    col_count = len(sequence2) + 1

    score_matrix: List[List[int]] = [[0] * col_count for _ in range(row_count)]
    backtrack_matrix: List[List[str]] = [[''] * col_count for _ in range(row_count)]

    # Borders: aligning a prefix against nothing costs one indel per character.
    for j in range(col_count):
        score_matrix[0][j] = -indel_penalty * j
        backtrack_matrix[0][j] = "left"
    for i in range(row_count):
        score_matrix[i][0] = -indel_penalty * i
        backtrack_matrix[i][0] = "up"

    for i in range(1, row_count):
        for j in range(1, col_count):
            pair = (sequence1[i - 1], sequence2[j - 1])
            key: Tuple[str, str] = pair if pair in scoring_matrix else (pair[1], pair[0])
            diagonal_score: int = score_matrix[i - 1][j - 1] + scoring_matrix[key]
            up_score: int = score_matrix[i - 1][j] - indel_penalty
            left_score: int = score_matrix[i][j - 1] - indel_penalty
            best = max(diagonal_score, up_score, left_score)
            score_matrix[i][j] = best

            if best == diagonal_score:
                backtrack_matrix[i][j] = "diagonal"
            elif best == up_score:
                backtrack_matrix[i][j] = "up"
            else:
                backtrack_matrix[i][j] = "left"

    # Trace back from the bottom-right corner; columns are collected in
    # reverse and flipped once at the end.
    i = row_count - 1
    j = col_count - 1
    column1: List[str] = []
    column2: List[str] = []
    while i != 0 or j != 0:
        direction = backtrack_matrix[i][j]
        if direction == "diagonal":
            column1.append(sequence1[i - 1])
            column2.append(sequence2[j - 1])
            i -= 1
            j -= 1
        elif direction == "up":
            column1.append(sequence1[i - 1])
            column2.append("-")
            i -= 1
        else:
            column1.append("-")
            column2.append(sequence2[j - 1])
            j -= 1

    aligned_seq1 = "".join(reversed(column1))
    aligned_seq2 = "".join(reversed(column2))
    return score_matrix[row_count - 1][col_count - 1], aligned_seq1, aligned_seq2
# Sample input: one amino acid string per line.
sample_input: str = """
PLEASANTLY
MEANLY
"""

input_lines: List[str] = sample_input.strip().split("\n")
sequence1: str = input_lines[0]
sequence2: str = input_lines[1]

alignment_score: int
aligned_sequence1: str
aligned_sequence2: str
alignment_score, aligned_sequence1, aligned_sequence2 = global_alignment(sequence1, sequence2)
print(alignment_score)
print(aligned_sequence1)
print(aligned_sequence2)
53 Find a Highest-Scoring Local Alignment of Two Strings
Local Alignment Problem. Find the highest-scoring local alignment between two strings.
Given: Two amino acid strings.
Return: The maximum score of a local alignment of the strings, followed by a local alignment of these strings achieving the maximum score. Use the PAM250 scoring matrix and indel penalty σ = 5. (If multiple local alignments achieving the maximum score exist, you may return any one.)
53.1 Sample Dataset
MEANLY
PENALTY
53.2 Sample Output
15
EANL-Y
ENALTY
53.3 Solution
from typing import Dict, List, Tuple, Optional
# PAM250 substitution scores as a nested mapping: PAM250[a][b] is the score
# for aligning residue a with residue b.
PAM250: Dict[str, Dict[str, int]] = {
    'A': {'A': 2, 'C': -2, 'D': 0, 'E': 0, 'F': -3, 'G': 1, 'H': -1, 'I': -1, 'K': -1, 'L': -2, 'M': -1, 'N': 0,
          'P': 1, 'Q': 0, 'R': -2, 'S': 1, 'T': 1, 'V': 0, 'W': -6, 'Y': -3},
    'C': {'A': -2, 'C': 12, 'D': -5, 'E': -5, 'F': -4, 'G': -3, 'H': -3, 'I': -2, 'K': -5, 'L': -6, 'M': -5,
          'N': -4, 'P': -3, 'Q': -5, 'R': -4, 'S': 0, 'T': -2, 'V': -2, 'W': -8, 'Y': 0},
    'D': {'A': 0, 'C': -5, 'D': 4, 'E': 3, 'F': -6, 'G': 1, 'H': 1, 'I': -2, 'K': 0, 'L': -4, 'M': -3, 'N': 2,
          'P': -1, 'Q': 2, 'R': -1, 'S': 0, 'T': 0, 'V': -2, 'W': -7, 'Y': -4},
    'E': {'A': 0, 'C': -5, 'D': 3, 'E': 4, 'F': -5, 'G': 0, 'H': 1, 'I': -2, 'K': 0, 'L': -3, 'M': -2, 'N': 1,
          'P': -1, 'Q': 2, 'R': -1, 'S': 0, 'T': 0, 'V': -2, 'W': -7, 'Y': -4},
    'F': {'A': -3, 'C': -4, 'D': -6, 'E': -5, 'F': 9, 'G': -5, 'H': -2, 'I': 1, 'K': -5, 'L': 2, 'M': 0, 'N': -3,
          'P': -5, 'Q': -5, 'R': -4, 'S': -3, 'T': -3, 'V': -1, 'W': 0, 'Y': 7},
    'G': {'A': 1, 'C': -3, 'D': 1, 'E': 0, 'F': -5, 'G': 5, 'H': -2, 'I': -3, 'K': -2, 'L': -4, 'M': -3, 'N': 0,
          'P': 0, 'Q': -1, 'R': -3, 'S': 1, 'T': 0, 'V': -1, 'W': -7, 'Y': -5},
    'H': {'A': -1, 'C': -3, 'D': 1, 'E': 1, 'F': -2, 'G': -2, 'H': 6, 'I': -2, 'K': 0, 'L': -2, 'M': -2, 'N': 2,
          'P': 0, 'Q': 3, 'R': 2, 'S': -1, 'T': -1, 'V': -2, 'W': -3, 'Y': 0},
    'I': {'A': -1, 'C': -2, 'D': -2, 'E': -2, 'F': 1, 'G': -3, 'H': -2, 'I': 5, 'K': -2, 'L': 2, 'M': 2, 'N': -2,
          'P': -2, 'Q': -2, 'R': -2, 'S': -1, 'T': 0, 'V': 4, 'W': -5, 'Y': -1},
    'K': {'A': -1, 'C': -5, 'D': 0, 'E': 0, 'F': -5, 'G': -2, 'H': 0, 'I': -2, 'K': 5, 'L': -3, 'M': 0, 'N': 1,
          'P': -1, 'Q': 1, 'R': 3, 'S': 0, 'T': 0, 'V': -2, 'W': -3, 'Y': -4},
    'L': {'A': -2, 'C': -6, 'D': -4, 'E': -3, 'F': 2, 'G': -4, 'H': -2, 'I': 2, 'K': -3, 'L': 6, 'M': 4, 'N': -3,
          'P': -3, 'Q': -2, 'R': -3, 'S': -3, 'T': -2, 'V': 2, 'W': -2, 'Y': -1},
    'M': {'A': -1, 'C': -5, 'D': -3, 'E': -2, 'F': 0, 'G': -3, 'H': -2, 'I': 2, 'K': 0, 'L': 4, 'M': 6, 'N': -2,
          'P': -2, 'Q': -1, 'R': 0, 'S': -2, 'T': -1, 'V': 2, 'W': -4, 'Y': -2},
    'N': {'A': 0, 'C': -4, 'D': 2, 'E': 1, 'F': -3, 'G': 0, 'H': 2, 'I': -2, 'K': 1, 'L': -3, 'M': -2, 'N': 2,
          'P': 0, 'Q': 1, 'R': 0, 'S': 1, 'T': 0, 'V': -2, 'W': -4, 'Y': -2},
    'P': {'A': 1, 'C': -3, 'D': -1, 'E': -1, 'F': -5, 'G': 0, 'H': 0, 'I': -2, 'K': -1, 'L': -3, 'M': -2, 'N': 0,
          'P': 6, 'Q': 0, 'R': 0, 'S': 1, 'T': 0, 'V': -1, 'W': -6, 'Y': -5},
    'Q': {'A': 0, 'C': -5, 'D': 2, 'E': 2, 'F': -5, 'G': -1, 'H': 3, 'I': -2, 'K': 1, 'L': -2, 'M': -1, 'N': 1,
          'P': 0, 'Q': 4, 'R': 1, 'S': -1, 'T': -1, 'V': -2, 'W': -5, 'Y': -4},
    'R': {'A': -2, 'C': -4, 'D': -1, 'E': -1, 'F': -4, 'G': -3, 'H': 2, 'I': -2, 'K': 3, 'L': -3, 'M': 0, 'N': 0,
          'P': 0, 'Q': 1, 'R': 6, 'S': 0, 'T': -1, 'V': -2, 'W': 2, 'Y': -4},
    'S': {'A': 1, 'C': 0, 'D': 0, 'E': 0, 'F': -3, 'G': 1, 'H': -1, 'I': -1, 'K': 0, 'L': -3, 'M': -2, 'N': 1,
          'P': 1, 'Q': -1, 'R': 0, 'S': 2, 'T': 1, 'V': -1, 'W': -2, 'Y': -3},
    'T': {'A': 1, 'C': -2, 'D': 0, 'E': 0, 'F': -3, 'G': 0, 'H': -1, 'I': 0, 'K': 0, 'L': -2, 'M': -1, 'N': 0,
          'P': 0, 'Q': -1, 'R': -1, 'S': 1, 'T': 3, 'V': 0, 'W': -5, 'Y': -3},
    'V': {'A': 0, 'C': -2, 'D': -2, 'E': -2, 'F': -1, 'G': -1, 'H': -2, 'I': 4, 'K': -2, 'L': 2, 'M': 2, 'N': -2,
          'P': -1, 'Q': -2, 'R': -2, 'S': -1, 'T': 0, 'V': 4, 'W': -6, 'Y': -2},
    'W': {'A': -6, 'C': -8, 'D': -7, 'E': -7, 'F': 0, 'G': -7, 'H': -3, 'I': -5, 'K': -3, 'L': -2, 'M': -4,
          'N': -4, 'P': -6, 'Q': -5, 'R': 2, 'S': -2, 'T': -5, 'V': -6, 'W': 17, 'Y': 0},
    'Y': {'A': -3, 'C': 0, 'D': -4, 'E': -4, 'F': 7, 'G': -5, 'H': 0, 'I': -1, 'K': -4, 'L': -1, 'M': -2, 'N': -2,
          'P': -5, 'Q': -4, 'R': -4, 'S': -3, 'T': -3, 'V': -2, 'W': 0, 'Y': 10}}
def local_alignment(
    sequence1: str,
    sequence2: str,
    indel_penalty: int = 5,
    scoring_matrix: Optional[Dict[str, Dict[str, int]]] = None,
) -> Tuple[int, str, str]:
    """Compute a highest-scoring local alignment of two strings.

    Args:
        sequence1: First string (shown on the first alignment row).
        sequence2: Second string (shown on the second alignment row).
        indel_penalty: Positive cost subtracted for every gap column.
        scoring_matrix: Nested substitution scores, ``matrix[a][b]``. If the
            residue from ``sequence1`` is not a top-level key, the lookup is
            tried as ``matrix[b][a]`` instead. Defaults to the module-level
            ``PAM250`` table.

    Returns:
        ``(score, aligned_sequence1, aligned_sequence2)`` for the first
        maximum-scoring cell in row-major order; ``(0, "", "")`` when no
        positive-scoring alignment exists.

    Raises:
        KeyError: If a residue pair is missing from ``scoring_matrix``.
    """
    if scoring_matrix is None:
        scoring_matrix = PAM250

    row_count = len(sequence1) + 1
    col_count = len(sequence2) + 1

    score_matrix: List[List[int]] = [[0] * col_count for _ in range(row_count)]
    # None marks a cell where an alignment starts (border or score reset to 0).
    backtrack_matrix: List[List[Optional[str]]] = [[None] * col_count for _ in range(row_count)]

    for i in range(1, row_count):
        for j in range(1, col_count):
            key1, key2 = sequence1[i - 1], sequence2[j - 1]
            if key1 not in scoring_matrix:
                key1, key2 = key2, key1

            diagonal_score: int = score_matrix[i - 1][j - 1] + scoring_matrix[key1][key2]
            up_score: int = score_matrix[i - 1][j] - indel_penalty
            left_score: int = score_matrix[i][j - 1] - indel_penalty
            best = max(diagonal_score, up_score, left_score, 0)
            score_matrix[i][j] = best

            # A zero score means "start here": leave the pointer as None even
            # if a neighbour also happens to yield 0, so the traceback does
            # not drag zero-scoring columns into the alignment.
            if best == 0:
                continue
            if best == diagonal_score:
                backtrack_matrix[i][j] = "diagonal"
            elif best == up_score:
                backtrack_matrix[i][j] = "up"
            else:
                backtrack_matrix[i][j] = "left"

    # First cell (row-major) holding the maximum score.
    max_score: int = -1
    max_i: int = 0
    max_j: int = 0
    for i in range(row_count):
        for j in range(col_count):
            if score_matrix[i][j] > max_score:
                max_score = score_matrix[i][j]
                max_i, max_j = i, j

    i = max_i
    j = max_j
    column1: List[str] = []
    column2: List[str] = []
    while backtrack_matrix[i][j] is not None:
        direction = backtrack_matrix[i][j]
        if direction == "diagonal":
            column1.append(sequence1[i - 1])
            column2.append(sequence2[j - 1])
            i -= 1
            j -= 1
        elif direction == "up":
            column1.append(sequence1[i - 1])
            column2.append("-")
            i -= 1
        else:
            column1.append("-")
            column2.append(sequence2[j - 1])
            j -= 1

    return max_score, "".join(reversed(column1)), "".join(reversed(column2))
# Sample input: one amino acid string per line.
sample_input: str = """
MEANLY
PENALTY
"""

input_lines: List[str] = sample_input.strip().split("\n")
sequence1: str = input_lines[0]
sequence2: str = input_lines[1]

alignment_score: int
aligned_sequence1: str
aligned_sequence2: str
alignment_score, aligned_sequence1, aligned_sequence2 = local_alignment(sequence1, sequence2)
print(alignment_score)
print(aligned_sequence1)
print(aligned_sequence2)
54 Compute the Edit Distance Between Two Strings
Edit Distance Problem. Find the edit distance between two strings.
Given: Two amino acid strings.
Return: The edit distance between these strings.
54.1 Sample Dataset
PLEASANTLY
MEANLY
54.2 Sample Output
5
54.3 Solution
def calculate_edit_distance(source: str, target: str) -> int:
    """Return the edit (Levenshtein) distance between two strings.

    Insertions, deletions and substitutions each cost 1.

    Args:
        source: First string.
        target: Second string.

    Returns:
        The minimum number of single-character edits turning one string
        into the other.
    """
    source_len, target_len = len(source), len(target)

    # distance_matrix[t][s] = distance between target[:t] and source[:s].
    distance_matrix = [[0] * (source_len + 1) for _ in range(target_len + 1)]

    for target_index in range(target_len + 1):
        distance_matrix[target_index][0] = target_index
    for source_index in range(source_len + 1):
        distance_matrix[0][source_index] = source_index

    for target_index in range(target_len):
        for source_index in range(source_len):
            if source[source_index] == target[target_index]:
                # Matching characters extend the diagonal at no cost.
                distance = distance_matrix[target_index][source_index]
            else:
                distance = min(
                    distance_matrix[target_index + 1][source_index],
                    distance_matrix[target_index][source_index],
                    distance_matrix[target_index][source_index + 1],
                ) + 1
            distance_matrix[target_index + 1][source_index + 1] = distance

    return distance_matrix[target_len][source_len]
# Sample input: one string per line.
sample_input: str = """
PLEASANTLY
MEANLY
"""

source, target = sample_input.strip().split("\n")
print(calculate_edit_distance(source, target))
55 Find a Highest-Scoring Fitting Alignment of Two Strings
Fitting Alignment Problem. Construct a highest-scoring fitting alignment between two strings.
Given: Two DNA strings v and w, where v has length at most 10000 and w has length at most 1000.
Return: The maximum score of a fitting alignment of v and w, followed by a fitting alignment achieving this maximum score. Use the simple scoring method in which matches count +1 and both the mismatch and indel penalties are equal to 1. (If multiple fitting alignments achieving the maximum score exist, you may return any one.)
55.1 Sample Dataset
GTAGGCTTAAGGTTA
TAGATA
55.2 Sample Output
2
TAGGCTTA
TAGA--TA
55.3 Solution
from typing import Tuple, Dict
def calculate_fitting_alignment(sequence1: str, sequence2: str) -> Tuple[int, str, str]:
    """Compute a highest-scoring fitting alignment of ``sequence2`` into ``sequence1``.

    All of ``sequence2`` is aligned against some substring of ``sequence1``.
    Matches score +1; mismatches and indels score -1.

    Args:
        sequence1: The longer string; a substring of it is aligned.
        sequence2: The string that must be aligned in full.

    Returns:
        ``(score, aligned_substring_of_sequence1, aligned_sequence2)`` with
        gaps written as ``"-"``. Ties prefer the diagonal move, then "↑"
        (gap in ``sequence1``), then "←" (gap in ``sequence2``); the
        left-most best end position is used.
    """
    len1, len2 = len(sequence1), len(sequence2)

    # Rows follow sequence2, columns follow sequence1. Row 0 is all zeros, so
    # the alignment may start at any position of sequence1 for free.
    score_matrix = [[0] * (len1 + 1) for _ in range(len2 + 1)]
    path_matrix = [["←"] * (len1 + 1) for _ in range(len2 + 1)]
    for seq2_index in range(1, len2 + 1):
        score_matrix[seq2_index][0] = -seq2_index
        path_matrix[seq2_index][0] = "↑"

    arrows = ("↖", "↑", "←")
    for seq2_index in range(1, len2 + 1):
        previous_row = score_matrix[seq2_index - 1]
        current_row = score_matrix[seq2_index]
        for seq1_index in range(1, len1 + 1):
            is_match = sequence1[seq1_index - 1] == sequence2[seq2_index - 1]
            match_score = 1 if is_match else -1
            options = [
                previous_row[seq1_index - 1] + match_score,
                previous_row[seq1_index] - 1,
                current_row[seq1_index - 1] - 1,
            ]
            best = max(options)
            current_row[seq1_index] = best
            path_matrix[seq2_index][seq1_index] = arrows[options.index(best)]

    # The alignment may end anywhere in sequence1 but must use all of sequence2.
    final_scores = score_matrix[len2]
    max_score = max(final_scores)
    seq1_end_index = final_scores.index(max_score)
    seq2_end_index = len2

    # Trace back until all of sequence2 is consumed. Column 0 holds "↑", so a
    # path that reaches it still emits the remaining sequence2 characters
    # against gaps instead of silently dropping them.
    column1 = []
    column2 = []
    while seq2_end_index > 0:
        move = path_matrix[seq2_end_index][seq1_end_index]
        if move == "↖":
            column1.append(sequence1[seq1_end_index - 1])
            column2.append(sequence2[seq2_end_index - 1])
            seq2_end_index -= 1
            seq1_end_index -= 1
        elif move == "←":
            column1.append(sequence1[seq1_end_index - 1])
            column2.append("-")
            seq1_end_index -= 1
        else:
            column1.append("-")
            column2.append(sequence2[seq2_end_index - 1])
            seq2_end_index -= 1

    return max_score, "".join(reversed(column1)), "".join(reversed(column2))
# Sample input: the longer string first, then the string to fit into it.
sample_input: str = """
GTAGGCTTAAGGTTA
TAGATA
"""

sequence1, sequence2 = sample_input.strip().split("\n")
print(*calculate_fitting_alignment(sequence1, sequence2), sep="\n")
56 Find a Highest-Scoring Overlap Alignment of Two Strings
Overlap Alignment Problem. Construct a highest-scoring overlap alignment between two strings.
Given: Two protein strings v and w, each of length at most 1000.
Return: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v’ of v and a prefix w’ of w achieving this maximum score. Use an alignment score in which matches count +1 and both the mismatch and indel penalties are 2. (If multiple overlap alignments achieving the maximum score exist, you may return any one.)
56.1 Sample Dataset
PAWHEAE
HEAGAWGHEE
56.2 Sample Output
1
HEAE
HEAG
56.3 Solution
from typing import Tuple, Dict
def calculate_overlap_alignment(sequence1: str, sequence2: str, mismatch_penalty: int = -2) -> Tuple[int, str, str]:
    """Compute a highest-scoring overlap alignment of two strings.

    A suffix of ``sequence1`` is aligned against a prefix of ``sequence2``.
    Matches score +1; mismatches and indels both add ``mismatch_penalty``.

    Args:
        sequence1: String whose suffix takes part in the alignment.
        sequence2: String whose prefix takes part in the alignment.
        mismatch_penalty: Value added (normally negative) for each mismatch
            or gap column.

    Returns:
        ``(score, aligned_suffix_of_sequence1, aligned_prefix_of_sequence2)``
        with gaps written as ``"-"``. Ties prefer the diagonal move, then
        "↑" (gap in ``sequence1``), then "←" (gap in ``sequence2``); the
        shortest best prefix of ``sequence2`` is used.
    """
    len1, len2 = len(sequence1), len(sequence2)

    # Rows follow sequence2, columns follow sequence1. Row 0 is all zeros, so
    # the suffix of sequence1 may start anywhere for free.
    score_matrix = [[0] * (len1 + 1) for _ in range(len2 + 1)]
    path_matrix = [["←"] * (len1 + 1) for _ in range(len2 + 1)]
    for seq2_index in range(1, len2 + 1):
        score_matrix[seq2_index][0] = seq2_index * mismatch_penalty
        path_matrix[seq2_index][0] = "↑"

    arrows = ("↖", "↑", "←")
    for seq2_index in range(1, len2 + 1):
        previous_row = score_matrix[seq2_index - 1]
        current_row = score_matrix[seq2_index]
        for seq1_index in range(1, len1 + 1):
            is_match = sequence1[seq1_index - 1] == sequence2[seq2_index - 1]
            match_score = 1 if is_match else mismatch_penalty
            options = [
                previous_row[seq1_index - 1] + match_score,
                previous_row[seq1_index] + mismatch_penalty,
                current_row[seq1_index - 1] + mismatch_penalty,
            ]
            best = max(options)
            current_row[seq1_index] = best
            path_matrix[seq2_index][seq1_index] = arrows[options.index(best)]

    # The alignment must end at the end of sequence1 but may stop after any
    # prefix of sequence2.
    final_scores = [score_matrix[seq2_index][len1] for seq2_index in range(len2 + 1)]
    max_score = max(final_scores)
    seq2_end_index = final_scores.index(max_score)
    seq1_end_index = len1

    # Trace back until the chosen prefix of sequence2 is fully consumed.
    # Column 0 holds "↑", so a path that reaches it still emits the leading
    # sequence2 characters against gaps instead of dropping them.
    column1 = []
    column2 = []
    while seq2_end_index > 0:
        move = path_matrix[seq2_end_index][seq1_end_index]
        if move == "↖":
            column1.append(sequence1[seq1_end_index - 1])
            column2.append(sequence2[seq2_end_index - 1])
            seq2_end_index -= 1
            seq1_end_index -= 1
        elif move == "←":
            column1.append(sequence1[seq1_end_index - 1])
            column2.append("-")
            seq1_end_index -= 1
        else:
            column1.append("-")
            column2.append(sequence2[seq2_end_index - 1])
            seq2_end_index -= 1

    return max_score, "".join(reversed(column1)), "".join(reversed(column2))
# Sample input: v on the first line (suffix used), w on the second (prefix used).
sample_input: str = """
PAWHEAE
HEAGAWGHEE
"""

sequence1, sequence2 = sample_input.strip().split("\n")
print(*calculate_overlap_alignment(sequence1, sequence2), sep="\n")
57 Align Two Strings Using Affine Gap Penalties
Alignment with Affine Gap Penalties Problem. Construct a highest-scoring global alignment of two strings (with affine gap penalties).
Given: Two amino acid strings v and w (each of length at most 100).
Return: The maximum alignment score between v and w, followed by an alignment of v and w achieving this maximum score. Use the BLOSUM62 scoring matrix, a gap opening penalty of 11, and a gap extension penalty of 1.
57.1 Sample Dataset
PRTEINS
PRTWPSEIN
57.2 Sample Output
8
PRT---EINS
PRTWPSEIN-
57.3 Solution
from typing import Dict, Tuple, List, Optional
# BLOSUM62 substitution scores. Each unordered residue pair is stored once,
# so a lookup may need to try both (a, b) and (b, a).
BLOSUM62: Dict[Tuple[str, str], int] = {
    ('W', 'F'): 1, ('L', 'R'): -2, ('S', 'P'): -1, ('V', 'T'): 0,
    ('Q', 'Q'): 5, ('N', 'A'): -2, ('Z', 'Y'): -2, ('W', 'R'): -3,
    ('Q', 'A'): -1, ('S', 'D'): 0, ('H', 'H'): 8, ('S', 'H'): -1,
    ('H', 'D'): -1, ('L', 'N'): -3, ('W', 'A'): -3, ('Y', 'M'): -1,
    ('G', 'R'): -2, ('Y', 'I'): -1, ('Y', 'E'): -2, ('B', 'Y'): -3,
    ('Y', 'A'): -2, ('V', 'D'): -3, ('B', 'S'): 0, ('Y', 'Y'): 7,
    ('G', 'N'): 0, ('E', 'C'): -4, ('Y', 'Q'): -1, ('Z', 'Z'): 4,
    ('V', 'A'): 0, ('C', 'C'): 9, ('M', 'R'): -1, ('V', 'E'): -2,
    ('T', 'N'): 0, ('P', 'P'): 7, ('V', 'I'): 3, ('V', 'S'): -2,
    ('Z', 'P'): -1, ('V', 'M'): 1, ('T', 'F'): -2, ('V', 'Q'): -2,
    ('K', 'K'): 5, ('P', 'D'): -1, ('I', 'H'): -3, ('I', 'D'): -3,
    ('T', 'R'): -1, ('P', 'L'): -3, ('K', 'G'): -2, ('M', 'N'): -2,
    ('P', 'H'): -2, ('F', 'Q'): -3, ('Z', 'G'): -2, ('X', 'L'): -1,
    ('T', 'M'): -1, ('Z', 'C'): -3, ('X', 'H'): -1, ('D', 'R'): -2,
    ('B', 'W'): -4, ('X', 'D'): -1, ('Z', 'K'): 1, ('F', 'A'): -2,
    ('Z', 'W'): -3, ('F', 'E'): -3, ('D', 'N'): 1, ('B', 'K'): 0,
    ('X', 'X'): -1, ('F', 'I'): 0, ('B', 'G'): -1, ('X', 'T'): 0,
    ('F', 'M'): 0, ('B', 'C'): -3, ('Z', 'I'): -3, ('Z', 'V'): -2,
    ('S', 'S'): 4, ('L', 'Q'): -2, ('W', 'E'): -3, ('Q', 'R'): 1,
    ('N', 'N'): 6, ('W', 'M'): -1, ('Q', 'C'): -3, ('W', 'I'): -3,
    ('S', 'C'): -1, ('L', 'A'): -1, ('S', 'G'): 0, ('L', 'E'): -3,
    ('W', 'Q'): -2, ('H', 'G'): -2, ('S', 'K'): 0, ('Q', 'N'): 0,
    ('N', 'R'): 0, ('H', 'C'): -3, ('Y', 'N'): -2, ('G', 'Q'): -2,
    ('Y', 'F'): 3, ('C', 'A'): 0, ('V', 'L'): 1, ('G', 'E'): -2,
    ('G', 'A'): 0, ('K', 'R'): 2, ('E', 'D'): 2, ('Y', 'R'): -2,
    ('M', 'Q'): 0, ('T', 'I'): -1, ('C', 'D'): -3, ('V', 'F'): -1,
    ('T', 'A'): 0, ('T', 'P'): -1, ('B', 'P'): -2, ('T', 'E'): -1,
    ('V', 'N'): -3, ('P', 'G'): -2, ('M', 'A'): -1, ('K', 'H'): -1,
    ('V', 'R'): -3, ('P', 'C'): -3, ('M', 'E'): -2, ('K', 'L'): -2,
    ('V', 'V'): 4, ('M', 'I'): 1, ('T', 'Q'): -1, ('I', 'G'): -4,
    ('P', 'K'): -1, ('M', 'M'): 5, ('K', 'D'): -1, ('I', 'C'): -1,
    ('Z', 'D'): 1, ('F', 'R'): -3, ('X', 'K'): -1, ('Q', 'D'): 0,
    ('X', 'G'): -1, ('Z', 'L'): -3, ('X', 'C'): -2, ('Z', 'H'): 0,
    ('B', 'L'): -4, ('B', 'H'): 0, ('F', 'F'): 6, ('X', 'W'): -2,
    ('B', 'D'): 4, ('D', 'A'): -2, ('S', 'L'): -2, ('X', 'S'): 0,
    ('F', 'N'): -3, ('S', 'R'): -1, ('W', 'D'): -4, ('V', 'Y'): -1,
    ('W', 'L'): -2, ('H', 'R'): 0, ('W', 'H'): -2, ('H', 'N'): 1,
    ('W', 'T'): -2, ('T', 'T'): 5, ('S', 'F'): -2, ('W', 'P'): -4,
    ('L', 'D'): -4, ('B', 'I'): -3, ('L', 'H'): -3, ('S', 'N'): 1,
    ('B', 'T'): -1, ('L', 'L'): 4, ('Y', 'K'): -2, ('E', 'Q'): 2,
    ('Y', 'G'): -3, ('Z', 'S'): 0, ('Y', 'C'): -2, ('G', 'D'): -1,
    ('B', 'V'): -3, ('E', 'A'): -1, ('Y', 'W'): 2, ('E', 'E'): 5,
    ('Y', 'S'): -2, ('C', 'N'): -3, ('V', 'C'): -1, ('T', 'H'): -2,
    ('P', 'R'): -2, ('V', 'G'): -3, ('T', 'L'): -1, ('V', 'K'): -2,
    ('K', 'Q'): 1, ('R', 'A'): -1, ('I', 'R'): -3, ('T', 'D'): -1,
    ('P', 'F'): -4, ('I', 'N'): -3, ('K', 'I'): -3, ('M', 'D'): -3,
    ('V', 'W'): -3, ('W', 'W'): 11, ('M', 'H'): -2, ('P', 'N'): -2,
    ('K', 'A'): -1, ('M', 'L'): 2, ('K', 'E'): 1, ('Z', 'E'): 4,
    ('X', 'N'): -1, ('Z', 'A'): -1, ('Z', 'M'): -1, ('X', 'F'): -1,
    ('K', 'C'): -3, ('B', 'Q'): 0, ('X', 'B'): -1, ('B', 'M'): -3,
    ('F', 'C'): -2, ('Z', 'Q'): 3, ('X', 'Z'): -1, ('F', 'G'): -3,
    ('B', 'E'): 1, ('X', 'V'): -1, ('F', 'K'): -3, ('B', 'A'): -2,
    ('X', 'R'): -1, ('D', 'D'): 6, ('W', 'G'): -2, ('Z', 'F'): -3,
    ('S', 'Q'): 0, ('W', 'C'): -2, ('W', 'K'): -3, ('H', 'Q'): 0,
    ('L', 'C'): -1, ('W', 'N'): -4, ('S', 'A'): 1, ('L', 'G'): -4,
    ('W', 'S'): -3, ('S', 'E'): 0, ('H', 'E'): 0, ('S', 'I'): -2,
    ('H', 'A'): -2, ('S', 'M'): -1, ('Y', 'L'): -1, ('Y', 'H'): 2,
    ('Y', 'D'): -3, ('E', 'R'): 0, ('X', 'P'): -2, ('G', 'G'): 6,
    ('G', 'C'): -3, ('E', 'N'): 0, ('Y', 'T'): -2, ('Y', 'P'): -3,
    ('T', 'K'): -1, ('A', 'A'): 4, ('P', 'Q'): -1, ('T', 'C'): -1,
    ('V', 'H'): -3, ('T', 'G'): -2, ('I', 'Q'): -3, ('Z', 'T'): -1,
    ('C', 'R'): -3, ('V', 'P'): -2, ('P', 'E'): -1, ('M', 'C'): -1,
    ('K', 'N'): 0, ('I', 'I'): 4, ('P', 'A'): -1, ('M', 'G'): -3,
    ('T', 'S'): 1, ('I', 'E'): -3, ('P', 'M'): -2, ('M', 'K'): -1,
    ('I', 'A'): -1, ('P', 'I'): -3, ('R', 'R'): 5, ('X', 'M'): -1,
    ('L', 'I'): 2, ('X', 'I'): -1, ('Z', 'B'): 1, ('X', 'E'): -1,
    ('Z', 'N'): 0, ('X', 'A'): 0, ('B', 'R'): -1, ('B', 'N'): 3,
    ('F', 'D'): -3, ('X', 'Y'): -1, ('Z', 'R'): 0, ('F', 'H'): -1,
    ('B', 'F'): -3, ('F', 'L'): 0, ('X', 'Q'): -1, ('B', 'B'): 4,
}
def insert_gap(sequence: str, position: int) -> str:
    """Insert a gap ('-') into the sequence at the specified position.

    Args:
        sequence: The string to modify.
        position: Slice index before which the gap is placed; the usual
            slicing rules apply (e.g. ``len(sequence)`` appends the gap).

    Returns:
        A new string with one ``"-"`` inserted.
    """
    return sequence[:position] + "-" + sequence[position:]
def global_alignment_affine(seq1: str, seq2: str, gap_open_penalty: int = -11, gap_extend_penalty: int = -1) -> Tuple[int, str, str]:
    """
    Perform global sequence alignment with affine gap penalty.

    Three score tables are filled, keyed by (i, j) = lengths of the aligned
    prefixes of ``seq1`` and ``seq2``:

    * ``gap_seq1_score`` - the last column pairs ``seq1[i-1]`` with a gap;
    * ``gap_seq2_score`` - the last column pairs ``seq2[j-1]`` with a gap;
    * ``match_score``    - the best of the two tables above and of a column
      pairing ``seq1[i-1]`` with ``seq2[j-1]``.

    Substitution scores are read from the module-level ``BLOSUM62`` table
    (either key order is accepted; a missing pair scores 0).

    Args:
        seq1 (str): First sequence to align
        seq2 (str): Second sequence to align
        gap_open_penalty (int): Penalty (negative) for opening a gap
        gap_extend_penalty (int): Penalty (negative) for extending a gap

    Returns:
        Tuple[int, str, str]: Alignment score and aligned sequences
    """
    scoring_matrix: Dict[Tuple[str, str], int] = BLOSUM62
    # Boundary states that cannot occur must never win a max(). A finite
    # stand-in such as 10 * gap_open_penalty is overtaken by real scores once
    # the sequences are long enough, so use a true minus infinity.
    unreachable = float("-inf")

    match_score: Dict[Tuple[int, int], float] = {}
    gap_seq1_score: Dict[Tuple[int, int], float] = {}
    gap_seq2_score: Dict[Tuple[int, int], float] = {}
    prev_match: Dict[Tuple[int, int], int] = {}
    prev_gap_seq1: Dict[Tuple[int, int], int] = {}
    prev_gap_seq2: Dict[Tuple[int, int], int] = {}

    # Initialize matrices
    gap_seq1_score[0, 0] = match_score[0, 0] = gap_seq2_score[0, 0] = 0
    for i in range(1, len(seq1) + 1):
        gap_seq1_score[i, 0] = gap_open_penalty + (i - 1) * gap_extend_penalty
        match_score[i, 0] = gap_open_penalty + (i - 1) * gap_extend_penalty
        gap_seq2_score[i, 0] = unreachable
    for j in range(1, len(seq2) + 1):
        gap_seq2_score[0, j] = gap_open_penalty + (j - 1) * gap_extend_penalty
        match_score[0, j] = gap_open_penalty + (j - 1) * gap_extend_penalty
        gap_seq1_score[0, j] = unreachable

    # Fill matrices
    for i in range(1, len(seq1) + 1):
        for j in range(1, len(seq2) + 1):
            # seq1[i-1] against a gap: extend (index 0) or open (index 1)
            gap_seq1_options = [
                gap_seq1_score[i - 1, j] + gap_extend_penalty,
                match_score[i - 1, j] + gap_open_penalty,
            ]
            gap_seq1_score[i, j] = max(gap_seq1_options)
            prev_gap_seq1[i, j] = gap_seq1_options.index(gap_seq1_score[i, j])

            # seq2[j-1] against a gap: extend (index 0) or open (index 1)
            gap_seq2_options = [
                gap_seq2_score[i, j - 1] + gap_extend_penalty,
                match_score[i, j - 1] + gap_open_penalty,
            ]
            gap_seq2_score[i, j] = max(gap_seq2_options)
            prev_gap_seq2[i, j] = gap_seq2_options.index(gap_seq2_score[i, j])

            # Match/mismatch; the option index doubles as the table id
            # (0 = gap_seq1_score, 1 = match_score, 2 = gap_seq2_score).
            blosum_score: int = scoring_matrix.get(
                (seq1[i - 1], seq2[j - 1]),
                scoring_matrix.get((seq2[j - 1], seq1[i - 1]), 0),
            )
            match_options = [
                gap_seq1_score[i, j],
                match_score[i - 1, j - 1] + blosum_score,
                gap_seq2_score[i, j],
            ]
            match_score[i, j] = max(match_options)
            prev_match[i, j] = match_options.index(match_score[i, j])

    # Traceback
    i, j = len(seq1), len(seq2)
    aligned_seq1, aligned_seq2 = seq1, seq2

    scores = [gap_seq1_score[i, j], match_score[i, j], gap_seq2_score[i, j]]
    max_score = max(scores)
    current_matrix: int = scores.index(max_score)

    while i * j != 0:
        if current_matrix == 0:  # In gap_seq1_score matrix
            if prev_gap_seq1[i, j] == 1:
                current_matrix = 1
            i -= 1
            aligned_seq2 = insert_gap(aligned_seq2, j)
        elif current_matrix == 1:  # In match_score matrix
            if prev_match[i, j] == 1:
                i -= 1
                j -= 1
            else:
                current_matrix = prev_match[i, j]
        else:  # In gap_seq2_score matrix
            if prev_gap_seq2[i, j] == 1:
                current_matrix = 1
            j -= 1
            aligned_seq1 = insert_gap(aligned_seq1, i)

    # Handle remaining overhangs
    while i > 0:
        aligned_seq2 = insert_gap(aligned_seq2, 0)
        i -= 1
    while j > 0:
        aligned_seq1 = insert_gap(aligned_seq1, 0)
        j -= 1

    return max_score, aligned_seq1, aligned_seq2
# Sample usage
sample_input: str = """
PRTEINS
PRTWPSEIN
"""
seq1, seq2 = sample_input.strip().split("\n")
alignment_score, aligned_seq1, aligned_seq2 = global_alignment_affine(seq1, seq2)
print(alignment_score, aligned_seq1, aligned_seq2, sep="\n")
58 Find a Middle Edge in an Alignment Graph in Linear Space
Middle Edge in Linear Space Problem. Find a middle edge in the alignment graph in linear space.
Given: Two amino acid strings.
Return: A middle edge in the alignment graph of these strings, where the optimal path is defined by the BLOSUM62 scoring matrix and a linear indel penalty equal to 5. Return the middle edge in the form "(i, j) (k, l)", where (i, j) connects to (k, l).
58.1 Sample Dataset
PLEASANTLY
MEASNLY
58.2 Sample Output
(4, 3) (5, 4)
58.3 Solution
from math import floor
from typing import Dict, Tuple, List
str, str], int] = {
BLOSUM62: Dict[Tuple['W', 'F'): 1, ('L', 'R'): -2, ('S', 'P'): -1, ('V', 'T'): 0,
('Q', 'Q'): 5, ('N', 'A'): -2, ('Z', 'Y'): -2, ('W', 'R'): -3,
('Q', 'A'): -1, ('S', 'D'): 0, ('H', 'H'): 8, ('S', 'H'): -1,
('H', 'D'): -1, ('L', 'N'): -3, ('W', 'A'): -3, ('Y', 'M'): -1,
('G', 'R'): -2, ('Y', 'I'): -1, ('Y', 'E'): -2, ('B', 'Y'): -3,
('Y', 'A'): -2, ('V', 'D'): -3, ('B', 'S'): 0, ('Y', 'Y'): 7,
('G', 'N'): 0, ('E', 'C'): -4, ('Y', 'Q'): -1, ('Z', 'Z'): 4,
('V', 'A'): 0, ('C', 'C'): 9, ('M', 'R'): -1, ('V', 'E'): -2,
('T', 'N'): 0, ('P', 'P'): 7, ('V', 'I'): 3, ('V', 'S'): -2,
('Z', 'P'): -1, ('V', 'M'): 1, ('T', 'F'): -2, ('V', 'Q'): -2,
('K', 'K'): 5, ('P', 'D'): -1, ('I', 'H'): -3, ('I', 'D'): -3,
('T', 'R'): -1, ('P', 'L'): -3, ('K', 'G'): -2, ('M', 'N'): -2,
('P', 'H'): -2, ('F', 'Q'): -3, ('Z', 'G'): -2, ('X', 'L'): -1,
('T', 'M'): -1, ('Z', 'C'): -3, ('X', 'H'): -1, ('D', 'R'): -2,
('B', 'W'): -4, ('X', 'D'): -1, ('Z', 'K'): 1, ('F', 'A'): -2,
('Z', 'W'): -3, ('F', 'E'): -3, ('D', 'N'): 1, ('B', 'K'): 0,
('X', 'X'): -1, ('F', 'I'): 0, ('B', 'G'): -1, ('X', 'T'): 0,
('F', 'M'): 0, ('B', 'C'): -3, ('Z', 'I'): -3, ('Z', 'V'): -2,
('S', 'S'): 4, ('L', 'Q'): -2, ('W', 'E'): -3, ('Q', 'R'): 1,
('N', 'N'): 6, ('W', 'M'): -1, ('Q', 'C'): -3, ('W', 'I'): -3,
('S', 'C'): -1, ('L', 'A'): -1, ('S', 'G'): 0, ('L', 'E'): -3,
('W', 'Q'): -2, ('H', 'G'): -2, ('S', 'K'): 0, ('Q', 'N'): 0,
('N', 'R'): 0, ('H', 'C'): -3, ('Y', 'N'): -2, ('G', 'Q'): -2,
('Y', 'F'): 3, ('C', 'A'): 0, ('V', 'L'): 1, ('G', 'E'): -2,
('G', 'A'): 0, ('K', 'R'): 2, ('E', 'D'): 2, ('Y', 'R'): -2,
('M', 'Q'): 0, ('T', 'I'): -1, ('C', 'D'): -3, ('V', 'F'): -1,
('T', 'A'): 0, ('T', 'P'): -1, ('B', 'P'): -2, ('T', 'E'): -1,
('V', 'N'): -3, ('P', 'G'): -2, ('M', 'A'): -1, ('K', 'H'): -1,
('V', 'R'): -3, ('P', 'C'): -3, ('M', 'E'): -2, ('K', 'L'): -2,
('V', 'V'): 4, ('M', 'I'): 1, ('T', 'Q'): -1, ('I', 'G'): -4,
('P', 'K'): -1, ('M', 'M'): 5, ('K', 'D'): -1, ('I', 'C'): -1,
('Z', 'D'): 1, ('F', 'R'): -3, ('X', 'K'): -1, ('Q', 'D'): 0,
('X', 'G'): -1, ('Z', 'L'): -3, ('X', 'C'): -2, ('Z', 'H'): 0,
('B', 'L'): -4, ('B', 'H'): 0, ('F', 'F'): 6, ('X', 'W'): -2,
('B', 'D'): 4, ('D', 'A'): -2, ('S', 'L'): -2, ('X', 'S'): 0,
('F', 'N'): -3, ('S', 'R'): -1, ('W', 'D'): -4, ('V', 'Y'): -1,
('W', 'L'): -2, ('H', 'R'): 0, ('W', 'H'): -2, ('H', 'N'): 1,
('W', 'T'): -2, ('T', 'T'): 5, ('S', 'F'): -2, ('W', 'P'): -4,
('L', 'D'): -4, ('B', 'I'): -3, ('L', 'H'): -3, ('S', 'N'): 1,
('B', 'T'): -1, ('L', 'L'): 4, ('Y', 'K'): -2, ('E', 'Q'): 2,
('Y', 'G'): -3, ('Z', 'S'): 0, ('Y', 'C'): -2, ('G', 'D'): -1,
('B', 'V'): -3, ('E', 'A'): -1, ('Y', 'W'): 2, ('E', 'E'): 5,
('Y', 'S'): -2, ('C', 'N'): -3, ('V', 'C'): -1, ('T', 'H'): -2,
('P', 'R'): -2, ('V', 'G'): -3, ('T', 'L'): -1, ('V', 'K'): -2,
('K', 'Q'): 1, ('R', 'A'): -1, ('I', 'R'): -3, ('T', 'D'): -1,
('P', 'F'): -4, ('I', 'N'): -3, ('K', 'I'): -3, ('M', 'D'): -3,
('V', 'W'): -3, ('W', 'W'): 11, ('M', 'H'): -2, ('P', 'N'): -2,
('K', 'A'): -1, ('M', 'L'): 2, ('K', 'E'): 1, ('Z', 'E'): 4,
('X', 'N'): -1, ('Z', 'A'): -1, ('Z', 'M'): -1, ('X', 'F'): -1,
('K', 'C'): -3, ('B', 'Q'): 0, ('X', 'B'): -1, ('B', 'M'): -3,
('F', 'C'): -2, ('Z', 'Q'): 3, ('X', 'Z'): -1, ('F', 'G'): -3,
('B', 'E'): 1, ('X', 'V'): -1, ('F', 'K'): -3, ('B', 'A'): -2,
('X', 'R'): -1, ('D', 'D'): 6, ('W', 'G'): -2, ('Z', 'F'): -3,
('S', 'Q'): 0, ('W', 'C'): -2, ('W', 'K'): -3, ('H', 'Q'): 0,
('L', 'C'): -1, ('W', 'N'): -4, ('S', 'A'): 1, ('L', 'G'): -4,
('W', 'S'): -3, ('S', 'E'): 0, ('H', 'E'): 0, ('S', 'I'): -2,
('H', 'A'): -2, ('S', 'M'): -1, ('Y', 'L'): -1, ('Y', 'H'): 2,
('Y', 'D'): -3, ('E', 'R'): 0, ('X', 'P'): -2, ('G', 'G'): 6,
('G', 'C'): -3, ('E', 'N'): 0, ('Y', 'T'): -2, ('Y', 'P'): -3,
('T', 'K'): -1, ('A', 'A'): 4, ('P', 'Q'): -1, ('T', 'C'): -1,
('V', 'H'): -3, ('T', 'G'): -2, ('I', 'Q'): -3, ('Z', 'T'): -1,
('C', 'R'): -3, ('V', 'P'): -2, ('P', 'E'): -1, ('M', 'C'): -1,
('K', 'N'): 0, ('I', 'I'): 4, ('P', 'A'): -1, ('M', 'G'): -3,
('T', 'S'): 1, ('I', 'E'): -3, ('P', 'M'): -2, ('M', 'K'): -1,
('I', 'A'): -1, ('P', 'I'): -3, ('R', 'R'): 5, ('X', 'M'): -1,
('L', 'I'): 2, ('X', 'I'): -1, ('Z', 'B'): 1, ('X', 'E'): -1,
('Z', 'N'): 0, ('X', 'A'): 0, ('B', 'R'): -1, ('B', 'N'): 3,
('F', 'D'): -3, ('X', 'Y'): -1, ('Z', 'R'): 0, ('F', 'H'): -1,
('B', 'F'): -3, ('F', 'L'): 0, ('X', 'Q'): -1, ('B', 'B'): 4
(
}
def calculate_alignment_scores(
    sequence1: str,
    sequence2: str,
    scoring_matrix: Dict[Tuple[str, str], int],
    gap_penalty: int
) -> Tuple[List[int], List[int]]:
    """Compute the last column of the global-alignment score table in linear space.

    Rows are indexed by prefixes of ``sequence1`` and columns by prefixes of
    ``sequence2``; only the previous and the current column are kept.

    Args:
        sequence1: Sequence indexing the rows.
        sequence2: Sequence indexing the columns.
        scoring_matrix: Substitution scores keyed by symbol pair; either key
            order is accepted and a missing pair scores 0.
        gap_penalty: Score added for each indel (negative for a penalty).

    Returns:
        ``(scores, backtrack)`` for the final column. ``backtrack[i]`` is 0
        when the best move into row ``i`` came from the previous column in
        the same row, 1 from the row above in the same column, 2 from the
        diagonal. Row 0 (and every row when ``sequence2`` is empty) is 0.
    """
    row_count = len(sequence1) + 1
    # A comprehension instead of range(..., step=gap_penalty): the latter
    # raises ValueError for a zero penalty.
    current_scores = [i * gap_penalty for i in range(row_count)]
    backtrack = [0] * row_count

    for symbol2 in sequence2:
        previous_scores = current_scores[:]
        current_scores[0] = previous_scores[0] + gap_penalty
        for i, symbol1 in enumerate(sequence1, start=1):
            substitution = scoring_matrix.get(
                (symbol1, symbol2),
                scoring_matrix.get((symbol2, symbol1), 0),
            )
            options = [
                previous_scores[i] + gap_penalty,
                current_scores[i - 1] + gap_penalty,
                previous_scores[i - 1] + substitution,
            ]
            current_scores[i] = max(options)
            backtrack[i] = options.index(current_scores[i])

    return current_scores, backtrack
def find_middle_edge(
    sequence1: str,
    sequence2: str,
    scoring_matrix: Dict[Tuple[str, str], int],
    gap_penalty: int
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """Find a middle edge of the alignment graph of two sequences.

    The middle column is ``len(sequence2) // 2``. Scores from the source to
    that column are combined with scores from the sink (computed on the
    reversed sequences) to pick the row through which a best path passes.

    Returns:
        ``((i, j), (k, l))`` where node ``(i, j)`` lies on the middle column
        and ``(k, l)`` is the node the chosen edge leads to. Nodes are
        (row in ``sequence1``, column in ``sequence2``).
    """
    midpoint = len(sequence2) // 2

    forward_scores, _ = calculate_alignment_scores(
        sequence1, sequence2[:midpoint], scoring_matrix, gap_penalty
    )
    reverse_scores, reverse_backtrack = calculate_alignment_scores(
        sequence1[::-1], sequence2[midpoint:][::-1], scoring_matrix, gap_penalty
    )

    # The reverse pass indexes rows from the sink, so flip it before summing.
    total_scores = [f + r for f, r in zip(forward_scores, reverse_scores[::-1])]
    best_score_index = total_scores.index(max(total_scores))

    row, column = start_node = (best_score_index, midpoint)
    # Ordered to match the backtrack codes of calculate_alignment_scores:
    # 0 = next column, 1 = next row, 2 = diagonal.
    possible_moves = [
        (row, column + 1),
        (row + 1, column),
        (row + 1, column + 1),
    ]
    end_node = possible_moves[reverse_backtrack[::-1][best_score_index]]

    return (start_node, end_node)
# Sample usage
sample_input: str = """
PLEASANTLY
MEASNLY
"""
sequence1, sequence2 = sample_input.strip().split("\n")
result = find_middle_edge(sequence1, sequence2, BLOSUM62, -5)
# Unpack so the edge prints as "(i, j) (k, l)", the format the problem asks for.
print(*result)
59 Align Two Strings Using Linear Space
Global Alignment in Linear Space Problem. Find the highest-scoring alignment between two strings using a scoring matrix in linear space.
Given: Two long amino acid strings (of length approximately 10,000).
Return: The maximum alignment score of these strings, followed by an alignment achieving this maximum score. Use the BLOSUM62 scoring matrix and indel penalty σ = 5.
59.1 Sample Dataset
PLEASANTLY
MEANLY
59.2 Sample Output
8
PLEASANTLY
-MEA--N-LY
59.3 Solution
from math import floor
from typing import Dict, Tuple, List
str, str], int] = {
BLOSUM62: Dict[Tuple['W', 'F'): 1, ('L', 'R'): -2, ('S', 'P'): -1, ('V', 'T'): 0,
('Q', 'Q'): 5, ('N', 'A'): -2, ('Z', 'Y'): -2, ('W', 'R'): -3,
('Q', 'A'): -1, ('S', 'D'): 0, ('H', 'H'): 8, ('S', 'H'): -1,
('H', 'D'): -1, ('L', 'N'): -3, ('W', 'A'): -3, ('Y', 'M'): -1,
('G', 'R'): -2, ('Y', 'I'): -1, ('Y', 'E'): -2, ('B', 'Y'): -3,
('Y', 'A'): -2, ('V', 'D'): -3, ('B', 'S'): 0, ('Y', 'Y'): 7,
('G', 'N'): 0, ('E', 'C'): -4, ('Y', 'Q'): -1, ('Z', 'Z'): 4,
('V', 'A'): 0, ('C', 'C'): 9, ('M', 'R'): -1, ('V', 'E'): -2,
('T', 'N'): 0, ('P', 'P'): 7, ('V', 'I'): 3, ('V', 'S'): -2,
('Z', 'P'): -1, ('V', 'M'): 1, ('T', 'F'): -2, ('V', 'Q'): -2,
('K', 'K'): 5, ('P', 'D'): -1, ('I', 'H'): -3, ('I', 'D'): -3,
('T', 'R'): -1, ('P', 'L'): -3, ('K', 'G'): -2, ('M', 'N'): -2,
('P', 'H'): -2, ('F', 'Q'): -3, ('Z', 'G'): -2, ('X', 'L'): -1,
('T', 'M'): -1, ('Z', 'C'): -3, ('X', 'H'): -1, ('D', 'R'): -2,
('B', 'W'): -4, ('X', 'D'): -1, ('Z', 'K'): 1, ('F', 'A'): -2,
('Z', 'W'): -3, ('F', 'E'): -3, ('D', 'N'): 1, ('B', 'K'): 0,
('X', 'X'): -1, ('F', 'I'): 0, ('B', 'G'): -1, ('X', 'T'): 0,
('F', 'M'): 0, ('B', 'C'): -3, ('Z', 'I'): -3, ('Z', 'V'): -2,
('S', 'S'): 4, ('L', 'Q'): -2, ('W', 'E'): -3, ('Q', 'R'): 1,
('N', 'N'): 6, ('W', 'M'): -1, ('Q', 'C'): -3, ('W', 'I'): -3,
('S', 'C'): -1, ('L', 'A'): -1, ('S', 'G'): 0, ('L', 'E'): -3,
('W', 'Q'): -2, ('H', 'G'): -2, ('S', 'K'): 0, ('Q', 'N'): 0,
('N', 'R'): 0, ('H', 'C'): -3, ('Y', 'N'): -2, ('G', 'Q'): -2,
('Y', 'F'): 3, ('C', 'A'): 0, ('V', 'L'): 1, ('G', 'E'): -2,
('G', 'A'): 0, ('K', 'R'): 2, ('E', 'D'): 2, ('Y', 'R'): -2,
('M', 'Q'): 0, ('T', 'I'): -1, ('C', 'D'): -3, ('V', 'F'): -1,
('T', 'A'): 0, ('T', 'P'): -1, ('B', 'P'): -2, ('T', 'E'): -1,
('V', 'N'): -3, ('P', 'G'): -2, ('M', 'A'): -1, ('K', 'H'): -1,
('V', 'R'): -3, ('P', 'C'): -3, ('M', 'E'): -2, ('K', 'L'): -2,
('V', 'V'): 4, ('M', 'I'): 1, ('T', 'Q'): -1, ('I', 'G'): -4,
('P', 'K'): -1, ('M', 'M'): 5, ('K', 'D'): -1, ('I', 'C'): -1,
('Z', 'D'): 1, ('F', 'R'): -3, ('X', 'K'): -1, ('Q', 'D'): 0,
('X', 'G'): -1, ('Z', 'L'): -3, ('X', 'C'): -2, ('Z', 'H'): 0,
('B', 'L'): -4, ('B', 'H'): 0, ('F', 'F'): 6, ('X', 'W'): -2,
('B', 'D'): 4, ('D', 'A'): -2, ('S', 'L'): -2, ('X', 'S'): 0,
('F', 'N'): -3, ('S', 'R'): -1, ('W', 'D'): -4, ('V', 'Y'): -1,
('W', 'L'): -2, ('H', 'R'): 0, ('W', 'H'): -2, ('H', 'N'): 1,
('W', 'T'): -2, ('T', 'T'): 5, ('S', 'F'): -2, ('W', 'P'): -4,
('L', 'D'): -4, ('B', 'I'): -3, ('L', 'H'): -3, ('S', 'N'): 1,
('B', 'T'): -1, ('L', 'L'): 4, ('Y', 'K'): -2, ('E', 'Q'): 2,
('Y', 'G'): -3, ('Z', 'S'): 0, ('Y', 'C'): -2, ('G', 'D'): -1,
('B', 'V'): -3, ('E', 'A'): -1, ('Y', 'W'): 2, ('E', 'E'): 5,
('Y', 'S'): -2, ('C', 'N'): -3, ('V', 'C'): -1, ('T', 'H'): -2,
('P', 'R'): -2, ('V', 'G'): -3, ('T', 'L'): -1, ('V', 'K'): -2,
('K', 'Q'): 1, ('R', 'A'): -1, ('I', 'R'): -3, ('T', 'D'): -1,
('P', 'F'): -4, ('I', 'N'): -3, ('K', 'I'): -3, ('M', 'D'): -3,
('V', 'W'): -3, ('W', 'W'): 11, ('M', 'H'): -2, ('P', 'N'): -2,
('K', 'A'): -1, ('M', 'L'): 2, ('K', 'E'): 1, ('Z', 'E'): 4,
('X', 'N'): -1, ('Z', 'A'): -1, ('Z', 'M'): -1, ('X', 'F'): -1,
('K', 'C'): -3, ('B', 'Q'): 0, ('X', 'B'): -1, ('B', 'M'): -3,
('F', 'C'): -2, ('Z', 'Q'): 3, ('X', 'Z'): -1, ('F', 'G'): -3,
('B', 'E'): 1, ('X', 'V'): -1, ('F', 'K'): -3, ('B', 'A'): -2,
('X', 'R'): -1, ('D', 'D'): 6, ('W', 'G'): -2, ('Z', 'F'): -3,
('S', 'Q'): 0, ('W', 'C'): -2, ('W', 'K'): -3, ('H', 'Q'): 0,
('L', 'C'): -1, ('W', 'N'): -4, ('S', 'A'): 1, ('L', 'G'): -4,
('W', 'S'): -3, ('S', 'E'): 0, ('H', 'E'): 0, ('S', 'I'): -2,
('H', 'A'): -2, ('S', 'M'): -1, ('Y', 'L'): -1, ('Y', 'H'): 2,
('Y', 'D'): -3, ('E', 'R'): 0, ('X', 'P'): -2, ('G', 'G'): 6,
('G', 'C'): -3, ('E', 'N'): 0, ('Y', 'T'): -2, ('Y', 'P'): -3,
('T', 'K'): -1, ('A', 'A'): 4, ('P', 'Q'): -1, ('T', 'C'): -1,
('V', 'H'): -3, ('T', 'G'): -2, ('I', 'Q'): -3, ('Z', 'T'): -1,
('C', 'R'): -3, ('V', 'P'): -2, ('P', 'E'): -1, ('M', 'C'): -1,
('K', 'N'): 0, ('I', 'I'): 4, ('P', 'A'): -1, ('M', 'G'): -3,
('T', 'S'): 1, ('I', 'E'): -3, ('P', 'M'): -2, ('M', 'K'): -1,
('I', 'A'): -1, ('P', 'I'): -3, ('R', 'R'): 5, ('X', 'M'): -1,
('L', 'I'): 2, ('X', 'I'): -1, ('Z', 'B'): 1, ('X', 'E'): -1,
('Z', 'N'): 0, ('X', 'A'): 0, ('B', 'R'): -1, ('B', 'N'): 3,
('F', 'D'): -3, ('X', 'Y'): -1, ('Z', 'R'): 0, ('F', 'H'): -1,
('B', 'F'): -3, ('F', 'L'): 0, ('X', 'Q'): -1, ('B', 'B'): 4
(
}
def calculate_alignment_scores(
    sequence1: str,
    sequence2: str,
    scoring_matrix: Dict[Tuple[str, str], int],
    gap_penalty: int
) -> Tuple[List[int], List[int]]:
    """Compute the last column of the global-alignment score table in linear space.

    Rows are indexed by prefixes of ``sequence1`` and columns by prefixes of
    ``sequence2``; only the previous and the current column are kept.

    Args:
        sequence1: Sequence indexing the rows.
        sequence2: Sequence indexing the columns.
        scoring_matrix: Substitution scores keyed by symbol pair; either key
            order is accepted and a missing pair scores 0.
        gap_penalty: Score added for each indel (negative for a penalty).

    Returns:
        ``(scores, backtrack)`` for the final column. ``backtrack[i]`` is 0
        when the best move into row ``i`` came from the previous column in
        the same row, 1 from the row above in the same column, 2 from the
        diagonal. Row 0 (and every row when ``sequence2`` is empty) is 0.
    """
    row_count = len(sequence1) + 1
    # A comprehension instead of range(..., step=gap_penalty): the latter
    # raises ValueError for a zero penalty.
    current_scores = [i * gap_penalty for i in range(row_count)]
    backtrack = [0] * row_count

    for symbol2 in sequence2:
        previous_scores = current_scores[:]
        current_scores[0] = previous_scores[0] + gap_penalty
        for i, symbol1 in enumerate(sequence1, start=1):
            substitution = scoring_matrix.get(
                (symbol1, symbol2),
                scoring_matrix.get((symbol2, symbol1), 0),
            )
            options = [
                previous_scores[i] + gap_penalty,
                current_scores[i - 1] + gap_penalty,
                previous_scores[i - 1] + substitution,
            ]
            current_scores[i] = max(options)
            backtrack[i] = options.index(current_scores[i])

    return current_scores, backtrack
def find_middle_edge(
    sequence1: str,
    sequence2: str,
    scoring_matrix: Dict[Tuple[str, str], int],
    gap_penalty: int
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """Find a middle edge of the alignment graph of two sequences.

    The middle column is ``len(sequence2) // 2``. Scores from the source to
    that column are combined with scores from the sink (computed on the
    reversed sequences) to pick the row through which a best path passes.

    Returns:
        ``((i, j), (k, l))`` where node ``(i, j)`` lies on the middle column
        and ``(k, l)`` is the node the chosen edge leads to. Nodes are
        (row in ``sequence1``, column in ``sequence2``).
    """
    midpoint = len(sequence2) // 2

    forward_scores, _ = calculate_alignment_scores(
        sequence1, sequence2[:midpoint], scoring_matrix, gap_penalty
    )
    reverse_scores, reverse_backtrack = calculate_alignment_scores(
        sequence1[::-1], sequence2[midpoint:][::-1], scoring_matrix, gap_penalty
    )

    # The reverse pass indexes rows from the sink, so flip it before summing.
    total_scores = [f + r for f, r in zip(forward_scores, reverse_scores[::-1])]
    best_score_index = total_scores.index(max(total_scores))

    row, column = start_node = (best_score_index, midpoint)
    # Ordered to match the backtrack codes of calculate_alignment_scores:
    # 0 = next column, 1 = next row, 2 = diagonal.
    possible_moves = [
        (row, column + 1),
        (row + 1, column),
        (row + 1, column + 1),
    ]
    end_node = possible_moves[reverse_backtrack[::-1][best_score_index]]

    return (start_node, end_node)
def calculate_alignment_score(
    aligned_seq1: str,
    aligned_seq2: str,
    scoring_matrix: Dict[Tuple[str, str], int],
    gap_penalty: int
) -> int:
    """Score an alignment column by column.

    A column containing a gap ('-') in either row contributes
    ``gap_penalty``; any other column contributes its substitution score
    (either key order is accepted, a missing pair scores 0). Columns are
    enumerated over ``aligned_seq1``.
    """
    def column_score(symbol1: str, symbol2: str) -> int:
        if symbol1 == "-" or symbol2 == "-":
            return gap_penalty
        return scoring_matrix.get(
            (symbol1, symbol2),
            scoring_matrix.get((symbol2, symbol1), 0),
        )

    return sum(
        column_score(symbol1, aligned_seq2[i])
        for i, symbol1 in enumerate(aligned_seq1)
    )
def find_alignment_path(
    sequence1: str,
    sequence2: str,
    scoring_matrix: Dict[Tuple[str, str], int],
    gap_penalty: int
) -> str:
    """Build an alignment path by divide and conquer on middle edges.

    Returns:
        A string of moves: "↓" consumes a symbol of ``sequence1``, "→" a
        symbol of ``sequence2`` and "↘" one symbol of each.
    """
    def linear_space_alignment(top: int, bottom: int, left: int, right: int) -> str:
        # Degenerate rectangles have exactly one path.
        if left == right:
            return "↓" * (bottom - top)
        if top == bottom:
            return "→" * (right - left)

        # find_middle_edge works in coordinates local to the sub-rectangle.
        (i, j), (i2, j2) = find_middle_edge(
            sequence1[top:bottom], sequence2[left:right], scoring_matrix, gap_penalty
        )
        if j == j2:
            edge = "↓"
        elif i == i2:
            edge = "→"
        else:
            edge = "↘"

        before = linear_space_alignment(top, i + top, left, j + left)
        after = linear_space_alignment(i2 + top, bottom, j2 + left, right)
        return before + edge + after

    return linear_space_alignment(0, len(sequence1), 0, len(sequence2))
def construct_alignment(
    alignment_path: str,
    sequence1: str,
    sequence2: str
) -> Tuple[str, str]:
    """Turn a path of moves into the two rows of an alignment.

    "↘" emits one symbol from each sequence, "↓" emits a symbol of
    ``sequence1`` against a gap, and any other character emits a gap against
    a symbol of ``sequence2``.
    """
    row1: List[str] = []
    row2: List[str] = []
    i = j = 0
    for direction in alignment_path:
        if direction == "↘":
            row1.append(sequence1[i])
            row2.append(sequence2[j])
            i += 1
            j += 1
        elif direction == "↓":
            row1.append(sequence1[i])
            row2.append("-")
            i += 1
        else:
            row1.append("-")
            row2.append(sequence2[j])
            j += 1
    # Join once at the end instead of growing two strings per step.
    return "".join(row1), "".join(row2)
# Sample usage
sample_input: str = """
PLEASANTLY
MEANLY
"""
sequence1, sequence2 = sample_input.strip().split("\n")
scoring_matrix = BLOSUM62
alignment_path = find_alignment_path(sequence1, sequence2, scoring_matrix, -5)
aligned_seq1, aligned_seq2 = construct_alignment(alignment_path, sequence1, sequence2)
print(calculate_alignment_score(aligned_seq1, aligned_seq2, scoring_matrix, -5))
print(aligned_seq1, aligned_seq2, sep="\n")
60 Find a Highest-Scoring Multiple Sequence Alignment
Multiple Longest Common Subsequence Problem. Find a longest common subsequence of multiple strings.
Given: Three DNA strings.
Return: The maximum score of a multiple alignment of these three strings, followed by a multiple alignment of the three strings achieving this maximum. Use a scoring function in which the score of an alignment column is 1 if all three symbols are identical and 0 otherwise. (If more than one multiple alignment achieve the maximum, you may return any one.)
60.1 Sample Dataset
ATATCCG
TCCGA
ATGTACTG
60.2 Sample Output
3
ATATCC-G-
---TCC-GA
ATGTACTG-
60.3 Solution
from itertools import product
from typing import List, Tuple, Dict
def is_valid_coordinate(pointer: Tuple[int, ...], position: Tuple[int, ...]) -> bool:
    """Return True when stepping from ``position`` by ``pointer`` keeps every coordinate non-negative."""
    return all(p + d >= 0 for p, d in zip(position, pointer))
def get_previous_position(position: Tuple[int, ...], pointer: Tuple[int, ...]) -> Tuple[int, ...]:
    """Return the cell reached from ``position`` by adding ``pointer`` coordinate-wise."""
    return tuple(p + d for p, d in zip(position, pointer))
def calculate_score(sequences: List[str], position: Tuple[int, ...], pointer: Tuple[int, ...]) -> int:
    """Score the alignment column produced by moving into ``position`` via ``pointer``.

    Only the full diagonal move (every component -1) consumes a symbol from
    every sequence; it scores 1 when those symbols are all identical. Every
    other move scores 0. Works for any number of sequences.
    """
    if not pointer or any(step != -1 for step in pointer):
        return 0
    # On the diagonal the previous cell is position - 1 in every coordinate,
    # which is also the index of the symbol consumed from each sequence.
    column = [sequence[index - 1] for sequence, index in zip(sequences, position)]
    return int(all(symbol == column[0] for symbol in column))
def generate_moves(dimension: int) -> List[Tuple[int, ...]]:
    """List every step to a predecessor cell in a ``dimension``-dimensional grid.

    Each component is 0 or -1; the all-zero step (no movement) is excluded.
    """
    all_steps = product([0, -1], repeat=dimension)
    next(all_steps)  # product() yields the all-zero tuple first; drop it
    return list(all_steps)
def multiple_sequence_alignment(sequences: List[str]) -> Tuple[int, str, str, str]:
    """Align three sequences, scoring 1 for every column whose three symbols agree.

    Fills a three-dimensional score table keyed by prefix lengths and then
    traces back from the cell of the full lengths.

    Returns:
        ``(score, row1, row2, row3)`` - the best score and the three aligned
        rows, with '-' marking gaps.
    """
    scores: Dict[Tuple[int, ...], int] = {}
    pointers: Dict[Tuple[int, ...], Tuple[int, ...]] = {}
    scores[(0, 0, 0)] = 0

    ranges = [range(0, len(seq) + 1) for seq in sequences]
    moves = generate_moves(3)  # identical for every cell, so build it once

    for position in product(*ranges):
        valid_pointers = [ptr for ptr in moves if is_valid_coordinate(ptr, position)]
        if not valid_pointers:
            continue  # only the origin has no predecessor
        possible_scores = [
            scores[get_previous_position(position, ptr)] + calculate_score(sequences, position, ptr)
            for ptr in valid_pointers
        ]
        best = max(possible_scores)
        scores[position] = best
        pointers[position] = valid_pointers[possible_scores.index(best)]

    # Traceback to recover alignment; product() ends on the cell of the full
    # lengths, so `position` is already the sink.
    total_score = scores[position]
    aligned_sequences = ["", "", ""]

    while any(x > 0 for x in position):
        pointer = pointers[position]
        for i, seq in enumerate(sequences):
            aligned_sequences[i] += seq[position[i] - 1] if pointer[i] == -1 else "-"
        position = get_previous_position(position, pointer)

    # Rows were collected from the sink backwards.
    return (total_score,
            aligned_sequences[0][::-1],
            aligned_sequences[1][::-1],
            aligned_sequences[2][::-1])
sample_input = """
ATATCCG
TCCGA
ATGTACTG
"""
sequences = sample_input.strip().split("\n")
alignment_score, seq1, seq2, seq3 = multiple_sequence_alignment(sequences)
print(alignment_score)
print(seq1)
print(seq2)
print(seq3)
61 Find a Topological Ordering of a DAG
Topological Ordering Problem. Find a topological ordering of a directed acyclic graph.
Given: The adjacency list of a graph (with nodes represented by integers).
Return: A topological ordering of this graph.
61.1 Sample Dataset
1 -> 2
2 -> 3
4 -> 2
5 -> 3
61.2 Sample Output
1, 4, 5, 2, 3
61.3 Solution
from typing import Dict, List, Tuple, Set
# Node labels are kept as the strings read from the adjacency list.
NodeLabel = str
# Adjacency list: node label -> labels of its direct successors.
Graph = Dict[NodeLabel, List[NodeLabel]]
def create_graph(edge_list: List[str]) -> Graph:
    """Build an adjacency list from lines of the form ``"source -> t1,t2,..."``.

    Every node that appears, including one that is only ever a target, gets
    an entry. Whitespace around "->" and around the commas is ignored.
    """
    graph: Graph = {}
    for edge in edge_list:
        source, targets = (part.strip() for part in edge.split("->"))
        target_list = [target.strip() for target in targets.split(",")]
        graph.setdefault(source, []).extend(target_list)
        for target in target_list:
            graph.setdefault(target, [])
    return graph
def depth_first_search(graph: Graph, node: NodeLabel, visited: Set[NodeLabel], stack: List[NodeLabel]) -> None:
    """Depth-first traversal from ``node`` that records finish order.

    Marks every newly reached node in ``visited`` and places the reached
    nodes at the front of ``stack`` so that the node finished last comes
    first. The traversal uses an explicit stack, so deep graphs do not hit
    the interpreter's recursion limit.
    """
    visited.add(node)
    finished: List[NodeLabel] = []
    # Each frame pairs a node with an iterator over its not-yet-examined
    # successors, mirroring the state a recursive call would hold.
    pending = [(node, iter(graph.get(node, [])))]
    while pending:
        current, successors = pending[-1]
        for neighbor in successors:
            if neighbor not in visited:
                visited.add(neighbor)
                pending.append((neighbor, iter(graph.get(neighbor, []))))
                break
        else:
            # No unexplored successor left: `current` is finished.
            pending.pop()
            finished.append(current)
    # One splice at the front instead of an O(n) insert(0, ...) per node.
    stack[:0] = reversed(finished)
def topological_sort(graph: Graph) -> List[NodeLabel]:
    """Return the nodes of ``graph`` ordered so that every edge points forward.

    Runs a depth-first search from each unvisited node, in the iteration
    order of ``graph``. Cycles are not detected.
    """
    visited: Set[NodeLabel] = set()
    stack: List[NodeLabel] = []
    for node in graph:
        if node not in visited:
            depth_first_search(graph, node, visited, stack)
    return stack
def parse_input(input_text: str) -> List[str]:
    """Split the input into its lines, dropping blank ones.

    Blank lines (and entirely empty input) yield no entries, so callers are
    not handed empty strings that are not edges.
    """
    return [line for line in input_text.strip().split("\n") if line.strip()]
# Sample usage
sample_input = """
1 -> 2
2 -> 3
4 -> 2
5 -> 3
"""
edge_list = parse_input(sample_input)
graph = create_graph(edge_list)
sorted_nodes = topological_sort(graph)
print(", ".join(sorted_nodes))
62 Implement GreedySorting to Sort a Permutation by Reversals
Implement GreedySorting.
Given: A signed permutation P.
Return: The sequence of permutations corresponding to applying GreedySorting to P, ending with the identity permutation.
62.1 Sample Dataset
(-3 +4 +1 +5 -2)
62.2 Sample Output
(-1 -4 +3 +5 -2)
(+1 -4 +3 +5 -2)
(+1 +2 -5 -3 +4)
(+1 +2 +3 +5 +4)
(+1 +2 +3 -4 -5)
(+1 +2 +3 +4 -5)
(+1 +2 +3 +4 +5)
62.3 Solution
from typing import List, Tuple
def perform_k_sorting_reversal(permutation: List[int], k: int) -> List[int]:
    """Reverse, with sign flips, the span that brings block ``k + 1`` to index ``k``.

    Scans from index ``k`` for the element whose absolute value is ``k + 1``
    and replaces ``permutation[k:j + 1]`` by its negated reversal. The list
    is modified in place and also returned. An IndexError is raised if no
    such element exists at or after index ``k``.
    """
    j = k
    while abs(permutation[j]) != k + 1:
        j += 1
    permutation[k:j + 1] = [-x for x in reversed(permutation[k:j + 1])]
    return permutation
def greedy_sorting(permutation: List[int]) -> List[List[int]]:
    """Sort a signed permutation by reversals, recording every intermediate state.

    For each index ``k`` in turn, reversals are applied until the element
    there equals ``k + 1``; a snapshot of the permutation is stored after
    each reversal.

    Returns:
        The snapshots in order (empty when the input is already sorted).
    """
    # Work on a copy: the reversal helper edits its argument in place and the
    # caller's list should not change as a side effect.
    permutation = list(permutation)
    reversal_sequence: List[List[int]] = []
    for k in range(len(permutation)):
        while permutation[k] != k + 1:
            permutation = perform_k_sorting_reversal(permutation, k)
            reversal_sequence.append(list(permutation))
    return reversal_sequence
def parse_permutation(input_text: str) -> List[int]:
    """Parse a signed permutation such as ``"(-3 +4 +1 +5 -2)"`` into a list of ints."""
    cleaned_input = input_text.strip().replace("(", "").replace(")", "")
    return [int(token) for token in cleaned_input.split()]
def format_permutation(permutation: List[int]) -> str:
    """Format a signed permutation as ``"(+1 -2 ...)"``; positive values get an explicit '+'."""
    formatted_elements = [f"+{x}" if x > 0 else str(x) for x in permutation]
    return "(" + " ".join(formatted_elements) + ")"
# Sample usage
sample_input = """
(-3 +4 +1 +5 -2)
"""
initial_permutation = parse_permutation(sample_input)
sorting_sequence = greedy_sorting(initial_permutation)

for permutation in sorting_sequence:
    print(format_permutation(permutation))
63 Compute the Number of Breakpoints in a Permutation
Number of Breakpoints Problem. Find the number of breakpoints in a permutation.
Given: A signed permutation P.
Return: The number of breakpoints in P.
63.1 Sample Dataset
(+3 +4 +5 -12 -8 -7 -6 +1 +2 +10 +9 -11 +13 +14)
63.2 Sample Output
8
63.3 Solution
from typing import List
def count_breakpoints(permutation: List[int]) -> int:
    """Count the breakpoints of a signed permutation.

    The permutation is framed by 0 on the left and ``len(permutation) + 1``
    on the right; every adjacent pair ``(a, b)`` of the framed sequence with
    ``b - a != 1`` is a breakpoint. All pairs are inspected, including the
    one formed by the last element and the right frame.
    """
    # The right frame is n + 1, not max(permutation) + 1: the largest block
    # may appear with a negative sign.
    augmented_permutation = [0, *permutation, len(permutation) + 1]
    return sum(
        right - left != 1
        for left, right in zip(augmented_permutation, augmented_permutation[1:])
    )
sample_input = """
(+3 +4 +5 -12 -8 -7 -6 +1 +2 +10 +9 -11 +13 +14)
"""
permutation_string = sample_input.strip()
permutation_string = permutation_string.replace("(", "").replace(")", "")
permutation = [int(x) for x in permutation_string.split()]

print(count_breakpoints(permutation))
64 Compute the 2-Break Distance Between a Pair of Genomes
2-Break Distance Problem. Find the 2-break distance between two genomes.
Given: Two genomes with circular chromosomes on the same set of synteny blocks.
Return: The 2-break distance between these two genomes.
64.1 Sample Dataset
(+1 +2 +3 +4 +5 +6)
(+1 -3 -6 -5)(+2 -4)
64.2 Sample Output
3
64.3 Solution
import re
from collections import defaultdict
from typing import List, Dict, Set, DefaultDict
def find_component(start_node: int, graph: Dict[int, List[int]]) -> Set[int]:
    """Return every node reachable from ``start_node`` by breadth-first search."""
    from collections import deque  # local import: O(1) pops from the left

    visited: Set[int] = {start_node}
    queue = deque([start_node])
    while queue:
        current_node = queue.popleft()
        for neighbor in graph[current_node]:
            # Mark on enqueue so no node is queued more than once.
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)
    return visited
def parse_genome_graph(genome_string: str) -> DefaultDict[int, List[int]]:
    """Parse a genome such as ``"(+1 -2)(+3)"`` into an adjacency graph.

    Each parenthesised group is read as a circular chromosome. For every
    block ``a`` followed (cyclically) by block ``b`` an undirected edge is
    stored between node ``a`` and node ``-b``.
    """
    genome_graph: DefaultDict[int, List[int]] = defaultdict(list)
    for component in re.findall(r"\((.+?)\)", genome_string):
        chromosome = list(map(int, component.split()))
        # Pair each block with its successor; the rotation supplies the
        # wrap-around pair (last block, first block) at the end.
        for block, following in zip(chromosome, chromosome[1:] + chromosome[:1]):
            genome_graph[block].append(-following)
            genome_graph[-following].append(block)
    return genome_graph
def breakpoint_graph(genome1: DefaultDict[int, List[int]], genome2: DefaultDict[int, List[int]]) -> Dict[int, List[int]]:
    """Overlay two genome graphs: each node of ``genome1`` maps to its neighbours in both genomes.

    The neighbour lists are concatenated (``genome1`` first) into new lists.
    """
    return {node: genome1[node] + genome2[node] for node in genome1.keys()}
def calculate_two_break_distance(genomes: List[DefaultDict[int, List[int]]]) -> int:
    """Compute the 2-break distance between two genome graphs.

    The distance is the number of synteny blocks (half the node count of the
    breakpoint graph) minus the number of connected components of the
    breakpoint graph.

    Args:
        genomes: Exactly two genome graphs over the same blocks.
    """
    combined_graph = breakpoint_graph(*genomes)
    nodes: Set[int] = set(combined_graph.keys())
    num_blocks = len(nodes) // 2
    num_components = 0
    while nodes:
        component = find_component(next(iter(nodes)), combined_graph)
        nodes -= component
        num_components += 1
    return num_blocks - num_components
sample_input = """
(+1 +2 +3 +4 +5 +6)
(+1 -3 -6 -5)(+2 -4)
"""
input_lines = sample_input.strip().split("\n")
genomes = [parse_genome_graph(genome_string) for genome_string in input_lines]
print(calculate_two_break_distance(genomes))
65 Find a Shortest Transformation of One Genome into Another by 2-Breaks
2-Break Sorting Problem. Find a shortest transformation of one genome into another by 2-breaks.
Given: Two genomes with circular chromosomes on the same set of synteny blocks.
Return: The sequence of genomes resulting from applying a shortest sequence of 2-breaks transforming one genome into the other.
65.1 Sample Dataset
(+1 -2 -3 +4)
(+1 +2 -4 -3)
65.2 Sample Output
(+1 -2 -3 +4)
(+1 +2 -3 +4)
(+1 +2 -4 +3)
(+1 +2 -4 -3)
65.3 Solution
import re
from collections import defaultdict
from typing import List, Dict, Set, Generator, DefaultDict
def find_component(start_node: int, graph: Dict[int, List[int]]) -> Set[int]:
    """Return every node reachable from ``start_node`` by breadth-first search."""
    from collections import deque  # local import: O(1) pops from the left

    visited: Set[int] = {start_node}
    queue = deque([start_node])
    while queue:
        current_node = queue.popleft()
        for neighbor in graph[current_node]:
            # Mark on enqueue so no node is queued more than once.
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)
    return visited
def parse_genome_graph(genome_string: str) -> DefaultDict[int, List[int]]:
    """Parse a genome such as ``"(+1 -2)(+3)"`` into an adjacency graph.

    Each parenthesised group is read as a circular chromosome. For every
    block ``a`` followed (cyclically) by block ``b`` an undirected edge is
    stored between node ``a`` and node ``-b``.
    """
    genome_graph: DefaultDict[int, List[int]] = defaultdict(list)
    for component in re.findall(r"\((.+?)\)", genome_string):
        chromosome = list(map(int, component.split()))
        # Pair each block with its successor; the rotation supplies the
        # wrap-around pair (last block, first block) at the end.
        for block, following in zip(chromosome, chromosome[1:] + chromosome[:1]):
            genome_graph[block].append(-following)
            genome_graph[-following].append(block)
    return genome_graph
def breakpoint_graph(genome1: DefaultDict[int, List[int]], genome2: DefaultDict[int, List[int]]) -> Dict[int, List[int]]:
    """Overlay two genome graphs: each node of ``genome1`` maps to its neighbours in both genomes.

    The neighbour lists are concatenated (``genome1`` first) into new lists.
    """
    return {node: genome1[node] + genome2[node] for node in genome1.keys()}
def format_perm(perm: List[int]) -> str:
    """Format one chromosome as ``"(+1 -2 ...)"`` with an explicit sign on every block."""
    signed_blocks = (f"{x:+}" for x in perm)
    return "(" + " ".join(signed_blocks) + ")"
def find_components(graph: Dict[int, List[int]]) -> Generator[Set[int], None, None]:
    """Yield the connected components of ``graph`` one at a time, each as a set of nodes."""
    nodes: Set[int] = set(graph.keys())
    while nodes:
        component = find_component(next(iter(nodes)), graph)
        nodes = nodes - component
        yield component
def non_trivial_cycle_nodes(graph: Dict[int, List[int]]) -> List[int] | None:
    """Return the nodes of the first component with more than two nodes, or None if there is none."""
    for component in find_components(graph):
        if len(component) > 2:
            return list(component)
    return None
def find_genome_component(start_node: int, graph: Dict[int, List[int]]) -> List[int]:
    """Walk a chromosome of a genome graph starting at ``start_node``.

    From each visited node the walk follows its edges and continues at the
    negation of each neighbour, which under the encoding of
    ``parse_genome_graph`` is the next block of the chromosome.

    Returns:
        The visited nodes in visiting order.
    """
    queue: List[int] = [start_node]
    visited: List[int] = []
    seen: Set[int] = set()  # same contents as `visited`, for O(1) lookups
    while queue:
        current_node = queue.pop(0)
        visited.append(current_node)
        seen.add(current_node)
        for neighbor in graph[current_node]:
            if -neighbor not in seen:
                queue.append(-neighbor)
    return visited
def format_genome_graph(genome_graph: Dict[int, List[int]]) -> str:
    """Render a genome graph as concatenated chromosomes, e.g. ``"(+1 -2)(+3)"``.

    Each chromosome is started at its remaining node of smallest absolute
    value, preferring the positive orientation, so the text does not depend
    on set iteration order.
    """
    nodes: Set[int] = set(genome_graph.keys())
    components: List[List[int]] = []
    while nodes:
        start = min(nodes, key=lambda node: (abs(node), node < 0))
        component = find_genome_component(start, genome_graph)
        # Both orientations of every block on this chromosome are now used up.
        nodes = nodes - set(component)
        nodes = nodes - set(-x for x in component)
        components.append(component)
    return "".join([format_perm(c) for c in components])
def add_edge(graph: Dict[int, List[int]], node1: int, node2: int) -> None:
    """Add the undirected edge ``node1 - node2`` by appending each node to the other's neighbour list."""
    graph[node1].append(node2)
    graph[node2].append(node1)
def del_edge(graph: Dict[int, List[int]], node1: int, node2: int) -> None:
    """Remove one occurrence of the undirected edge ``node1 - node2``.

    Raises ValueError (from ``list.remove``) if the edge is absent.
    """
    graph[node1].remove(node2)
    graph[node2].remove(node1)
def ba6d(genome1: DefaultDict[int, List[int]], genome2: DefaultDict[int, List[int]]) -> Generator[str, None, None]:
    """Transform ``genome1`` into ``genome2`` by 2-breaks, yielding each genome along the way.

    ``genome1`` is rewritten in place. The first value yielded is the
    starting genome; after every 2-break the updated genome is yielded.
    Iteration stops once the breakpoint graph has no component with more
    than two nodes.
    """
    combined_graph = breakpoint_graph(genome1, genome2)
    nodes = non_trivial_cycle_nodes(combined_graph)
    yield format_genome_graph(genome1)
    while nodes:
        # Take an edge (j, i2) of genome2 inside a non-trivial cycle, together
        # with the genome1 edges (i, j) and (i2, j2) currently at its ends.
        j = nodes[0]
        i2 = genome2[nodes[0]][0]
        i = genome1[j][0]
        j2 = genome1[i2][0]

        # 2-break: swap those two genome1 edges for (j, i2) and (j2, i).
        del_edge(genome1, i, j)
        del_edge(genome1, i2, j2)
        add_edge(genome1, j, i2)
        add_edge(genome1, j2, i)

        yield format_genome_graph(genome1)
        combined_graph = breakpoint_graph(genome1, genome2)
        nodes = non_trivial_cycle_nodes(combined_graph)
sample_input = """
(+1 -2 -3 +4)
(+1 +2 -4 -3)
"""
genome1, genome2 = [parse_genome_graph(s) for s in sample_input.strip().split("\n")]
for genome in ba6d(genome1, genome2):
    print(genome)
67 Implement ChromosomeToCycle
Chromosome To Cycle Problem. Solve the Chromosome To Cycle Problem.
Given: A chromosome Chromosome containing n synteny blocks.
Return: The sequence Nodes of integers between 1 and 2n resulting from applying ChromosomeToCycle to Chromosome.
67.1 Sample Dataset
(+1 -2 -3 +4)
67.2 Sample Output
(1 2 4 3 6 5 7 8)
67.3 Solution
from typing import List
def parse_permutation(permutation_string: str) -> List[int]:
    """Parse a chromosome such as "(+1 -2 -3 +4)" into a list of signed ints."""
    # Drop the surrounding parentheses, then split on whitespace.
    return list(map(int, permutation_string[1:-1].split()))
def chromosome_to_cycle(chromosome: List[int]) -> List[int]:
    """Convert signed synteny blocks to the node sequence of their cycle.

    Block +g becomes (2g - 1, 2g); block -g becomes (2g, 2g - 1).
    """
    nodes: List[int] = []
    for gene in chromosome:
        if gene > 0:
            nodes.extend([2 * gene - 1, 2 * gene])
        else:
            nodes.extend([-2 * gene, -2 * gene - 1])
    return nodes
def convert_chromosome_to_cycle(chromosome_string: str) -> List[int]:
    """Parse a chromosome string and return its cycle node sequence."""
    return chromosome_to_cycle(parse_permutation(chromosome_string))
def format_cycle(cycle: List[int]) -> str:
    """Format a node sequence as "(a b c ...)"."""
    return "(" + " ".join(map(str, cycle)) + ")"
sample_input: str = "(+1 -2 -3 +4)"

chromosome_string: str = sample_input.strip()
print(format_cycle(convert_chromosome_to_cycle(chromosome_string)))
68 Implement CycleToChromosome
Cycle To Chromosome Problem. Solve the Cycle to Chromosome Problem.
Given: A sequence Nodes of integers between 1 and 2n.
Return: The chromosome Chromosome containing n synteny blocks resulting from applying CycleToChromosome to Nodes.
68.1 Sample Dataset
(1 2 4 3 6 5 7 8)
68.2 Sample Output
(+1 -2 -3 +4)
68.3 Solution
from typing import List
def parse_cycle(cycle_string: str) -> List[int]:
    """Parse a node sequence such as "(1 2 4 3)" into a list of ints."""
    # Drop the surrounding parentheses, then split on whitespace.
    return list(map(int, cycle_string[1:-1].split()))
def format_chromosome(chromosome: List[int]) -> str:
    """Format signed blocks as "(+1 -2 ...)", always showing the sign."""
    return "(" + " ".join([f"{gene:+}" for gene in chromosome]) + ")"
def cycle_to_chromosome(cycle: List[int]) -> List[int]:
    """Convert a cycle node sequence back to signed synteny blocks.

    Nodes are consumed in consecutive pairs: an ascending pair (2g-1, 2g)
    is block +g, a descending pair (2g, 2g-1) is block -g.
    """
    chromosome: List[int] = []
    for j1, j2 in zip(cycle[::2], cycle[1::2]):
        if j1 < j2:
            chromosome.append(j2 // 2)
        else:
            # j1 is the even node of the pair here, so this is exactly -g.
            chromosome.append(-j1 // 2)
    return chromosome
sample_input: str = "(1 2 4 3 6 5 7 8)"

cycle_string: str = sample_input.strip()
chromosome: List[int] = cycle_to_chromosome(parse_cycle(cycle_string))
print(format_chromosome(chromosome))
69 Implement ColoredEdges
Colored Edges Problem. Find the Colored Edges in a genome.
Given: A genome P.
Return: The collection of colored edges in the genome graph of P in the form (x, y).
69.1 Sample Dataset
(+1 -2 -3)(+4 +5 -6)
69.2 Sample Output
(2, 4), (3, 6), (5, 1), (8, 9), (10, 12), (11, 7)
69.3 Solution
import re
from typing import List, Dict, Tuple
def chromosome_to_cycle(chromosome: List[int]) -> List[int]:
    """Convert signed synteny blocks to the node sequence of their cycle.

    Block +g becomes (2g - 1, 2g); block -g becomes (2g, 2g - 1).
    """
    nodes: List[int] = []
    for gene in chromosome:
        if gene > 0:
            nodes.extend([2 * gene - 1, 2 * gene])
        else:
            nodes.extend([-2 * gene, -2 * gene - 1])
    return nodes
def parse_integers(string: str) -> List[int]:
    """Parse whitespace-separated (optionally signed) integers."""
    return list(map(int, string.split()))
def get_colored_edges(genome: List[List[int]]) -> Dict[int, int]:
    """Return the colored edges of a genome as a start-node -> end-node map.

    For each chromosome, the second node of block j is joined to the first
    node of block j + 1, wrapping around at the end of the chromosome.
    """
    edge_dict: Dict[int, int] = {}
    for chromosome in genome:
        nodes = chromosome_to_cycle(chromosome)
        for j in range(len(chromosome)):
            start_index = 2 * j + 1
            # Modulo closes the cycle: the last block links back to the first.
            end_index = (2 * j + 2) % len(nodes)
            edge_dict[nodes[start_index]] = nodes[end_index]
    return edge_dict
sample_input: str = "(+1 -2 -3)(+4 +5 -6)"

genome_string: str = sample_input.strip()
# Each parenthesised group is one chromosome.
genome: List[List[int]] = [parse_integers(x) for x in re.findall(r"\((.+?)\)", genome_string)]
edges: List[Tuple[int, int]] = [(k, v) for k, v in get_colored_edges(genome).items()]
print(*edges, sep=", ")
70 Implement GraphToGenome
Graph To Genome Problem. Solve the Graph To Genome Problem.
Given: The colored edges of a genome graph.
Return: A genome corresponding to the genome graph.
70.1 Sample Dataset
(2, 4), (3, 6), (5, 1), (7, 9), (10, 12), (11, 8)
70.2 Sample Output
(+1 -2 -3)(-4 +5 -6)
70.3 Solution
import re
from typing import List, Dict, Tuple, Iterator
from copy import copy
def format_chromosome(chromosome: List[int]) -> str:
    """Format signed blocks as "(+1 -2 ...)", always showing the sign."""
    return "(" + " ".join([f"{gene:+}" for gene in chromosome]) + ")"
def cycle_to_chromosome(cycle: List[int]) -> List[int]:
    """Convert a cycle node sequence back to signed synteny blocks.

    Nodes are consumed in consecutive pairs: an ascending pair (2g-1, 2g)
    is block +g, a descending pair (2g, 2g-1) is block -g.
    """
    chromosome: List[int] = []
    for j1, j2 in zip(cycle[::2], cycle[1::2]):
        if j1 < j2:
            chromosome.append(j2 // 2)
        else:
            # j1 is the even node of the pair here, so this is exactly -g.
            chromosome.append(-j1 // 2)
    return chromosome
def parse_integers(string: str) -> List[int]:
    """Parse integers separated by ", " (comma followed by a space)."""
    return list(map(int, string.split(", ")))
def get_first_key(dictionary: Dict) -> int:
    """Return the first key of the dictionary in iteration order.

    Raises StopIteration if the dictionary is empty.
    """
    return next(iter(dictionary.keys()))
# Find a single cycle from colored edges
def find_node_cycle(graph: Dict[int, int]) -> List[int]:
    """Extract one cycle of nodes from an undirected colored-edge map.

    graph must contain every colored edge in both directions. The edges of
    the extracted cycle are removed from graph (it is mutated in place).

    Raises:
        ValueError: if the edges run out before the walk returns to its start.
    """
    start: int = get_first_key(graph)
    current: int = start
    component: List[int] = []
    while graph:
        # Follow the colored edge and drop both of its directions.
        next_node: int = graph.pop(current)
        graph.pop(next_node)
        # Hop to the other end of the same synteny block (2g-1 <-> 2g).
        neighbor: int = next_node + 1 if next_node % 2 else next_node - 1
        if neighbor == start:
            # Rotate so the sequence starts with a complete block pair.
            return [next_node] + component + [current]
        component.extend([current, next_node])
        current = neighbor
    raise ValueError("colored edges do not form a closed cycle")
# find cycles in colored edges
# to do this, we first make each edge "undirected"
def find_node_cycles(graph: Dict[int, int]) -> Iterator[List[int]]:
    """Yield the node cycles described by a map of colored edges.

    The input is left untouched: a shallow copy is made and every edge is
    mirrored so it can be followed from either end.
    """
    graph = copy(graph)
    # Snapshot the items because the dict grows while we add reverse edges.
    for k, v in list(graph.items()):
        graph[v] = k
    while graph:
        yield find_node_cycle(graph)
def graph_to_genome(genome_graph: Dict[int, int]) -> List[List[int]]:
    """Convert colored edges into a genome, one chromosome per node cycle."""
    genome: List[List[int]] = []
    for nodes in find_node_cycles(genome_graph):
        genome.append(cycle_to_chromosome(nodes))
    return genome
def parse_edge_string(edge_string: str) -> Dict[int, int]:
    """Parse "(a, b), (c, d), ..." into a map from a to b, c to d, ..."""
    graph: Dict[int, int] = {}
    for x in re.findall(r"\((.+?)\)", edge_string):
        a, b = parse_integers(x)
        graph[a] = b
    return graph
sample_input: str = "(2, 4), (3, 6), (5, 1), (7, 9), (10, 12), (11, 8)"

edge_string: str = sample_input.strip()
genome_graph: Dict[int, int] = parse_edge_string(edge_string)
print(*[format_chromosome(x) for x in graph_to_genome(genome_graph)], sep="")
71 Implement 2-BreakOnGenomeGraph
2-Break On Genome Graph Problem. Solve the 2-Break On Genome Graph Problem.
Given: The colored edges of a genome graph GenomeGraph, followed by indices i, i’, j, and j’.
Return: The colored edges of the genome graph resulting from applying the 2-break operation.
71.1 Sample Dataset
(2, 4), (3, 8), (7, 5), (6, 1)
1, 6, 3, 8
71.2 Sample Output
(2, 4), (3, 1), (7, 5), (6, 8)
71.3 Solution
import re
from typing import Dict, List, Tuple
from copy import copy
def parse_edge_string(edge_string: str) -> Dict[int, int]:
    """Parse "(a, b), (c, d), ..." into a map from a to b, c to d, ..."""
    graph: Dict[int, int] = {}
    for match in re.findall(r"\((.+?)\)", edge_string):
        node1, node2 = parse_integers(match)
        graph[node1] = node2
    return graph
def parse_integers(string: str) -> List[int]:
    """Parse integers separated by ", " (comma followed by a space)."""
    return list(map(int, string.split(", ")))
def remove_edge(graph: Dict[int, int], edge: Tuple[int, int]) -> None:
    """Remove an edge from the map, whichever endpoint it is keyed by.

    Raises KeyError if neither endpoint is a key of graph.
    """
    if edge[0] in graph:
        graph.pop(edge[0])
    else:
        graph.pop(edge[1])
def two_break_on_genome_graph(graph: Dict[int, int], i: int, ip: int, j: int, jp: int) -> Dict[int, int]:
    """Apply a 2-break: replace edges (i, ip), (j, jp) with (i, j), (ip, jp).

    Works on a shallow copy, so the input graph is not modified.
    """
    new_graph: Dict[int, int] = copy(graph)
    for edge in [(i, ip), (j, jp)]:
        remove_edge(new_graph, edge)
    new_graph[i] = j
    new_graph[ip] = jp
    return new_graph
sample_input: str = """
(2, 4), (3, 8), (7, 5), (6, 1)
1, 6, 3, 8
"""

edge_string, break_points = sample_input.strip().split("\n")
graph: Dict[int, int] = parse_edge_string(edge_string)
break_points_list: List[int] = parse_integers(break_points)
new_graph: Dict[int, int] = two_break_on_genome_graph(graph, *break_points_list)
edges: List[Tuple[int, int]] = [(k, v) for k, v in new_graph.items()]
print(*edges, sep=", ")
72 Implement 2-BreakOnGenome
2-Break On Genome Problem. Solve the 2-Break On Genome Problem.
Given: A genome P, followed by indices i, i’, j, and j’.
Return: The genome P’ resulting from applying the 2-break operation.
72.1 Sample Dataset
(+1 -2 -4 +3)
1, 6, 3, 8
72.2 Sample Output
(+1 -2) (-4 +3)
72.3 Solution
from typing import List, Dict, Tuple, Iterator
from copy import copy
def parse_permutation(s: str) -> List[int]:
    """Parse a chromosome such as "(+1 -2 -4 +3)" into a list of signed ints."""
    # Drop the surrounding parentheses, then split on whitespace.
    return list(map(int, s[1:-1].split()))
def format_permutation(chromosome: List[int]) -> str:
    """Format signed blocks as "(+1 -2 ...)", always showing the sign."""
    return "(" + " ".join([f"{gene:+}" for gene in chromosome]) + ")"
def chromosome_to_cycle(chromosome: List[int]) -> List[int]:
    """Convert signed synteny blocks to the node sequence of their cycle.

    Block +g becomes (2g - 1, 2g); block -g becomes (2g, 2g - 1).
    """
    nodes: List[int] = []
    for gene in chromosome:
        if gene > 0:
            nodes.extend([2 * gene - 1, 2 * gene])
        else:
            nodes.extend([-2 * gene, -2 * gene - 1])
    return nodes
def colored_edges(genome: List[List[int]]) -> Dict[int, int]:
    """Return the colored edges of a genome as a start-node -> end-node map.

    For each chromosome, the second node of block j is joined to the first
    node of block j + 1, wrapping around at the end of the chromosome.
    """
    graph: Dict[int, int] = {}
    for chromosome in genome:
        nodes = chromosome_to_cycle(chromosome)
        for j in range(len(chromosome)):
            start_index = 2 * j + 1
            # Modulo closes the cycle: the last block links back to the first.
            end_index = (2 * j + 2) % len(nodes)
            graph[nodes[start_index]] = nodes[end_index]
    return graph
def remove_edge(graph: Dict[int, int], edge: Tuple[int, int]) -> None:
    """Remove an edge from the map, whichever endpoint it is keyed by.

    Raises KeyError if neither endpoint is a key of graph.
    """
    if edge[0] in graph:
        graph.pop(edge[0])
    else:
        graph.pop(edge[1])
def two_break_on_genome_graph(graph: Dict[int, int], i: int, ip: int, j: int, jp: int) -> Dict[int, int]:
    """Apply a 2-break: replace edges (i, ip), (j, jp) with (i, j), (ip, jp).

    Works on a shallow copy, so the input graph is not modified.
    """
    new_graph = copy(graph)
    for edge in [(i, ip), (j, jp)]:
        remove_edge(new_graph, edge)
    new_graph[i] = j
    new_graph[ip] = jp
    return new_graph
def get_first_key(dictionary: Dict) -> int:
    """Return the first key of the dictionary in iteration order.

    Raises StopIteration if the dictionary is empty.
    """
    return next(iter(dictionary.keys()))
def find_node_cycle(graph: Dict[int, int]) -> List[int]:
    """Extract one cycle of nodes from an undirected colored-edge map.

    graph must contain every colored edge in both directions. The edges of
    the extracted cycle are removed from graph (it is mutated in place).

    Raises:
        ValueError: if the edges run out before the walk returns to its start.
    """
    start = get_first_key(graph)
    current = start
    component: List[int] = []
    while graph:
        # Follow the colored edge and drop both of its directions.
        next_node = graph.pop(current)
        graph.pop(next_node)
        # Hop to the other end of the same synteny block (2g-1 <-> 2g).
        neighbor = next_node + 1 if next_node % 2 else next_node - 1
        if neighbor == start:
            # Rotate so the sequence starts with a complete block pair.
            return [next_node] + component + [current]
        component.extend([current, next_node])
        current = neighbor
    raise ValueError("colored edges do not form a closed cycle")
def find_node_cycles(graph: Dict[int, int]) -> Iterator[List[int]]:
    """Yield the node cycles described by a map of colored edges.

    The input is left untouched: a shallow copy is made and every edge is
    mirrored so it can be followed from either end.
    """
    graph = copy(graph)
    # Snapshot the items because the dict grows while we add reverse edges.
    for k, v in list(graph.items()):
        graph[v] = k
    while graph:
        yield find_node_cycle(graph)
def cycle_to_chromosome(cycle: List[int]) -> List[int]:
    """Convert a cycle node sequence back to signed synteny blocks.

    Nodes are consumed in consecutive pairs: an ascending pair (2g-1, 2g)
    is block +g, a descending pair (2g, 2g-1) is block -g.
    """
    chromosome: List[int] = []
    for j1, j2 in zip(cycle[::2], cycle[1::2]):
        if j1 < j2:
            chromosome.append(j2 // 2)
        else:
            # j1 is the even node of the pair here, so this is exactly -g.
            chromosome.append(-j1 // 2)
    return chromosome
def graph_to_genome(genome_graph: Dict[int, int]) -> List[List[int]]:
    """Convert colored edges into a genome, one chromosome per node cycle."""
    genome: List[List[int]] = []
    for nodes in find_node_cycles(genome_graph):
        genome.append(cycle_to_chromosome(nodes))
    return genome
def parse_integers(x: str) -> List[int]:
    """Parse integers separated by ", " (comma followed by a space)."""
    return list(map(int, x.split(", ")))
def two_break_on_genome(chromosome: List[int], i: int, ip: int, j: int, jp: int) -> List[List[int]]:
    """Apply a 2-break to a single chromosome and return the resulting genome.

    The chromosome is converted to colored edges, the edges (i, ip) and
    (j, jp) are replaced by (i, j) and (ip, jp), and the modified graph is
    converted back into chromosomes.
    """
    genome_graph = colored_edges([chromosome])
    genome_graph = two_break_on_genome_graph(genome_graph, i, ip, j, jp)
    return graph_to_genome(genome_graph)
sample_input: str = """
(+1 -2 -4 +3)
1, 6, 3, 8
"""

genome_str, indices_str = sample_input.strip().split("\n")
genome = parse_permutation(genome_str)
new_genome = two_break_on_genome(genome, *parse_integers(indices_str))
print(*[format_permutation(x) for x in new_genome])
73 Compute Distances Between Leaves
Distance Between Leaves Problem. Compute the distances between leaves in a weighted tree.
Given: An integer n followed by the adjacency list of a weighted tree with n leaves.
Return: A space-separated n x n matrix \((d_{i,j})\), where \(d_{i,j}\) is the length of the path between leaves i and j.
73.1 Sample Dataset
4
0->4:11
1->4:2
2->5:6
3->5:7
4->0:11
4->1:2
4->5:4
5->4:4
5->3:7
5->2:6
73.2 Sample Output
0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0
73.3 Solution
from re import split
from collections import defaultdict
from math import inf
from heapq import heappush, heappop
from typing import Dict, List, Set, Tuple
def get_all_nodes(graph: Dict[int, List[Dict[str, int]]]) -> Set[int]:
    """Return every node that appears as an edge source or as an edge target."""
    source_nodes = set(graph.keys())
    destination_nodes = {edge["n"] for edges in graph.values() for edge in edges}
    return source_nodes | destination_nodes
# Dijkstra's algorithm to find distance from start to all other nodes
# Assumes nodes are integers starting at 0!
def dijkstra(start: int, graph: Dict[int, List[Dict[str, int]]]) -> List[float]:
    """Return the shortest distance from start to every node of the graph.

    Node labels are used as list indices, so they must be the integers
    0..N-1. Each edge is a dict with keys "n" (target) and "w" (weight).
    Unreachable nodes keep the distance inf.
    """
    distances = [inf for _ in range(len(get_all_nodes(graph)))]
    distances[start] = 0
    priority_queue: List[Tuple[float, int]] = []
    heappush(priority_queue, (0, start))
    processed_nodes: Set[int] = set()

    while priority_queue:
        current_node = heappop(priority_queue)[1]
        # A node can be queued several times; only its first pop matters.
        if current_node in processed_nodes:
            continue
        processed_nodes.add(current_node)
        for neighbor in graph[current_node]:
            if neighbor["n"] not in processed_nodes:
                distances[neighbor["n"]] = min(distances[current_node] + neighbor["w"], distances[neighbor["n"]])
                heappush(priority_queue, (distances[neighbor["n"]], neighbor["n"]))

    return distances
def parse_weighted_graph(edges: List[str]) -> Dict[int, List[Dict[str, int]]]:
    """Parse lines such as "0->4:11" into an adjacency list.

    Each edge is stored under its source node as {"n": target, "w": weight}.
    """
    graph: Dict[int, List[Dict[str, int]]] = defaultdict(list)

    for edge in edges:
        # Splitting on runs of non-digits leaves exactly the three numbers.
        from_node, to_node, weight = map(int, split(r"\D+", edge))
        graph[from_node].append({"n": to_node, "w": weight})

    return graph
sample_input = """
4
0->4:11
1->4:2
2->5:6
3->5:7
4->0:11
4->1:2
4->5:4
5->4:4
5->3:7
5->2:6
"""

# The first line is the number of leaves; the rest are the edges.
num_leaves, *edges = sample_input.strip().split("\n")
num_leaves = int(num_leaves)
graph = parse_weighted_graph(edges)
for i in range(num_leaves):
    # Leaves are labelled 0..num_leaves-1, so keep only those columns.
    print(*dijkstra(i, graph)[:num_leaves])
74 Compute Limb Lengths in a Tree
Limb Length Problem. Find the limb length for a leaf in a tree.
Given: An integer n, followed by an integer j between 0 and n - 1, followed by a space-separated additive distance matrix D (whose elements are integers).
Return: The limb length of the leaf in Tree(D) corresponding to row j of this distance matrix (use 0-based indexing).
74.1 Sample Dataset
4
1
0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0
74.2 Sample Output
2
74.3 Solution
from re import split
from collections import defaultdict
from math import inf
from heapq import heappush, heappop
from typing import Dict, List, Set, Tuple
def get_all_nodes(graph: Dict[int, List[Dict[str, int]]]) -> Set[int]:
    """Return every node that appears as an edge source or as an edge target."""
    source_nodes: Set[int] = set(graph.keys())
    destination_nodes: Set[int] = {edge["to"] for edges in graph.values() for edge in edges}
    return source_nodes | destination_nodes
# Dijkstra's algorithm to find distance from start to all other nodes
# Assumes nodes are integers starting at 0!
def dijkstra(start: int, graph: Dict[int, List[Dict[str, int]]]) -> List[float]:
    """Return the shortest distance from start to every node of the graph.

    Node labels are used as list indices, so they must be the integers
    0..N-1. Each edge is a dict with keys "to" and "weight". Unreachable
    nodes keep the distance inf.
    """
    distances: List[float] = [inf for _ in range(len(get_all_nodes(graph)))]
    distances[start] = 0
    priority_queue: List[Tuple[float, int]] = []
    heappush(priority_queue, (0, start))
    processed_nodes: Set[int] = set()

    while priority_queue:
        current_distance, current_node = heappop(priority_queue)
        # A node can be queued several times; only its first pop matters.
        if current_node in processed_nodes:
            continue
        processed_nodes.add(current_node)
        for neighbor in graph[current_node]:
            neighbor_node: int = neighbor["to"]
            edge_weight: int = neighbor["weight"]
            if neighbor_node not in processed_nodes:
                new_distance: float = current_distance + edge_weight
                if new_distance < distances[neighbor_node]:
                    distances[neighbor_node] = new_distance
                    heappush(priority_queue, (new_distance, neighbor_node))

    return distances
def parse_weighted_graph(edges: List[str]) -> Dict[int, List[Dict[str, int]]]:
    """Parse lines such as "0->4:11" into an adjacency list.

    Each edge is stored under its source node as
    {"to": target, "weight": weight}.
    """
    graph: Dict[int, List[Dict[str, int]]] = defaultdict(list)

    for edge in edges:
        # Splitting on runs of non-digits leaves exactly the three numbers.
        from_node, to_node, weight = map(int, split(r"\D+", edge))
        graph[from_node].append({"to": to_node, "weight": weight})

    return graph
# NOTE(review): this driver prints leaf-to-leaf distances from an adjacency
# list; it does not read a distance matrix or compute a limb length as the
# problem statement of this section describes -- confirm intended solution.
sample_input: str = """
4
0->4:11
1->4:2
2->5:6
3->5:7
4->0:11
4->1:2
4->5:4
5->4:4
5->3:7
5->2:6
"""

# The first line is the number of leaves; the rest are the edges.
num_leaves, *edges = sample_input.strip().split("\n")
num_leaves: int = int(num_leaves)
graph: Dict[int, List[Dict[str, int]]] = parse_weighted_graph(edges)
for i in range(num_leaves):
    # Leaves are labelled 0..num_leaves-1, so keep only those columns.
    print(*dijkstra(i, graph)[:num_leaves])
75 Implement AdditivePhylogeny
Additive Phylogeny Problem. Construct the simple tree fitting an additive matrix.
Given: n and a tab-delimited n x n additive matrix.
Return: A weighted adjacency list for the simple tree fitting this matrix.
Note on formatting: The adjacency list must have consecutive integer node labels starting from 0. The n leaves must be labeled 0, 1, …, n-1 in order of their appearance in the distance matrix. Labels for internal nodes may be labeled in any order but must start from n and increase consecutively.
75.1 Sample Dataset
4
0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0
75.2 Sample Output
0->4:11
1->4:2
2->5:6
3->5:7
4->0:11
4->1:2
4->5:4
5->2:6
5->3:7
5->4:4
75.3 Solution
from collections import defaultdict
from typing import List, Dict, Set, Tuple, Generator
# Type aliases for clarity
DistanceMatrix = List[List[int]]  # square matrix of pairwise leaf distances
Graph = Dict[int, List[Dict[str, int]]]  # node -> [{"n": neighbour, "w": weight}]
def parse_distance_matrix(lines: List[str]) -> DistanceMatrix:
    """Parse an integer matrix from whitespace-separated text lines."""
    return [[int(x) for x in line.split()] for line in lines]
def calculate_limb_length(distance_matrix: DistanceMatrix, j: int) -> int:
    """Return the limb length of leaf j for an additive distance matrix.

    This is the minimum of (D[i][j] + D[j][k] - D[i][k]) // 2 over all
    leaves i and k different from j. If there is no such pair the result
    is float('inf').
    """
    min_limb_length = float('inf')
    n = len(distance_matrix)
    for k in range(n):
        for i in range(n):
            if j != k and i != j:
                limb_length = (distance_matrix[i][j] + distance_matrix[j][k] - distance_matrix[i][k]) // 2
                min_limb_length = min(limb_length, min_limb_length)
    return min_limb_length
def get_all_nodes(graph: Graph) -> Set[int]:
    """Return every node that appears as an edge source or as an edge target."""
    source_nodes = set(graph.keys())
    target_nodes = {edge["n"] for edges in graph.values() for edge in edges}
    return source_nodes | target_nodes
def find_path(graph: Graph, path: List[Tuple[int, int]], target: int) -> Generator[List[Tuple[int, int]], None, None]:
    """Search the tree depth-first and yield each route that ends at target.

    path is a list of (node, cumulative distance) pairs describing the
    route walked so far; the search continues from its last entry and
    never revisits a node already on the path.
    """
    if target in [x[0] for x in path]:
        yield path
        # The route is complete; extending it would only overshoot target.
        return
    current_node, current_distance = path[-1]
    if current_node in graph:
        for edge in graph[current_node]:
            if edge["n"] not in [x[0] for x in path]:
                new_path = path + [(edge["n"], current_distance + edge["w"])]
                yield from find_path(graph, new_path, target)
def find_leaves(distance_matrix: DistanceMatrix, n: int) -> Tuple[int, int, int]:
    """Find three leaves i, n, k such that D[i][k] = D[i][n] + D[n][k].

    Raises:
        ValueError: if no pair of leaves other than n satisfies the equation.
    """
    size = len(distance_matrix)
    for i in range(size):
        if i == n:
            continue
        for k in range(i + 1, size):
            # k == n (like i == n) satisfies the equation trivially and
            # would give a degenerate triple, so it is skipped.
            if k == n:
                continue
            if distance_matrix[i][k] == distance_matrix[i][n] + distance_matrix[n][k]:
                return i, n, k
    raise ValueError("No suitable leaves found")
def add_node(tree: Graph, i: int, k: int, x: int, n: int) -> Graph:
    """Add node in graph between i and k, distance x from i, labelled n.

    The edge of the i -> k path that spans distance x is split in two by
    the new node. The tree is modified in place and also returned.
    """
    path = list(find_path(tree, [(i, 0)], k))[0]
    # Find the first node on the path lying strictly beyond distance x.
    for p, node in enumerate(path):
        if node[1] > x:
            break
    p = p - 1
    i, d1 = path[p]
    k, d2 = path[p + 1]

    # Delete old edge and add new edges
    tree[i] = [edge for edge in tree[i] if edge["n"] != k]
    tree[i].append({"n": n, "w": x - d1})
    tree.setdefault(n, []).append({"n": k, "w": d2 - x})
    return tree
def additive_phylogeny(distance_matrix: DistanceMatrix, m: int) -> Graph:
    """Build the tree fitting an additive distance matrix, recursively.

    m is the number of leaves of the original (outermost) matrix; internal
    node labels start from m. The last row and column of distance_matrix
    are modified in place while the limb of the last leaf is trimmed.
    """
    n = len(distance_matrix) - 1
    if len(distance_matrix) == 2:
        # Base case: two leaves joined by a single edge.
        graph = defaultdict(list)
        graph[0].append({"n": 1, "w": distance_matrix[0][1]})
        return graph

    # Trim the limb of the last leaf so that it sits on an existing path.
    limb_length = calculate_limb_length(distance_matrix, n)
    for j in range(len(distance_matrix)):
        if j != n:
            distance_matrix[j][n] -= limb_length
            distance_matrix[n][j] = distance_matrix[j][n]

    i, n, k = find_leaves(distance_matrix, n)
    x = distance_matrix[i][n]

    # Remove row n and column n from distance_matrix
    reduced_matrix = [row[:n] + row[n+1:] for row in distance_matrix[:n] + distance_matrix[n+1:]]

    tree = additive_phylogeny(reduced_matrix, m)

    # Label for new internal node
    v = max(max(get_all_nodes(tree)), m - 1) + 1

    # Break an internal edge adding a new node (possibly) and add the new leaf node
    tree = add_node(tree, i, k, x, v)
    tree.setdefault(v, []).append({"n": n, "w": limb_length})
    return tree
def get_edges(graph: Graph) -> List[str]:
    """Return every edge in both directions as sorted "a->b:w" strings."""
    edges = []
    for k in sorted(graph):
        for v in graph[k]:
            # The graph stores each edge once; emit both directions.
            edges.append(f"{k}->{v['n']}:{v['w']}")
            edges.append(f"{v['n']}->{k}:{v['w']}")
    return sorted(edges)
# Sample usage
sample_input = """
4
0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0
"""

# The first line is the number of leaves; the rest is the matrix.
n, *distance_matrix_str = sample_input.strip().split("\n")
distance_matrix = parse_distance_matrix(distance_matrix_str)
graph = additive_phylogeny(distance_matrix, int(n))
for edge in get_edges(graph):
    print(edge)
76 Implement UPGMA
UPGMA Problem. Construct the ultrametric tree resulting from UPGMA.
Given: An integer n followed by a space-delimited n x n distance matrix.
Return: An adjacency list for the ultrametric tree output by UPGMA. Weights should be accurate to three decimal places.
Note on formatting: The adjacency list must have consecutive integer node labels starting from 0. The n leaves must be labeled 0, 1, …, n-1 in order of their appearance in the distance matrix. Labels for internal nodes may be labeled in any order but must start from n and increase consecutively.
76.1 Sample Dataset
4
0 20 17 11
20 0 20 13
17 20 0 10
11 13 10 0
76.2 Sample Output
0->5:7.000
1->6:8.833
2->4:5.000
3->4:5.000
4->2:5.000
4->3:5.000
4->5:2.000
5->0:7.000
5->4:2.000
5->6:1.833
6->5:1.833
6->1:8.833
76.3 Solution
from typing import List
def parse_matrix(lines: List[str]) -> List[List[int]]:
    """Parse an integer matrix from whitespace-separated text lines."""
    return [[int(x) for x in line.split()] for line in lines]
import numpy as np
from collections import defaultdict
from typing import Dict, List, Tuple
def as_edges(graph: Dict[int, List[Dict[str, float]]]) -> List[str]:
    """Return every edge in both directions as sorted "a->b:w" strings.

    Weights are written in fixed-point notation (format spec "f").
    """
    edges = []
    for source in sorted(graph):
        for target in graph[source]:
            # The graph stores each edge once; emit both directions.
            edges.append(f"{source}->{target['n']}:{target['w']:f}")
            edges.append(f"{target['n']}->{source}:{target['w']:f}")
    return sorted(edges)
def closest(distance_matrix: np.ndarray) -> Tuple[int, int]:
    """Find (first) minimum off diagonal index in an array.

    Works on a copy, so the caller's matrix is not modified.
    """
    distance_matrix = np.copy(distance_matrix)
    # Lift the diagonal above every other entry so it can never be the minimum.
    np.fill_diagonal(distance_matrix, distance_matrix.max() + 1)
    # argmin gives a flat index; divmod turns it into (row, column).
    return divmod(distance_matrix.argmin(), distance_matrix.shape[1])
def average_ind(distance_matrix: np.ndarray, i: int, j: int, size_i: int, size_j: int) -> np.ndarray:
    """Replace the ith row/col with the average of the ith and jth and remove the jth.

    The average is weighted by the cluster sizes size_i and size_j. A new
    matrix is returned; the input is not modified.
    """
    distance_matrix = np.copy(distance_matrix)
    average = (distance_matrix[i, :] * size_i + distance_matrix[j, :] * size_j) / (size_i + size_j)
    distance_matrix[i, :] = average
    distance_matrix[:, i] = average
    distance_matrix = np.delete(distance_matrix, j, 0)
    distance_matrix = np.delete(distance_matrix, j, 1)
    # The merged row averaged in distances to itself; reset the diagonal.
    np.fill_diagonal(distance_matrix, 0)
    return distance_matrix
def upgma(distance_matrix: np.ndarray, num_clusters: int) -> Dict[int, List[Dict[str, float]]]:
    """Build an ultrametric tree with UPGMA.

    Returns a map from each internal node to its two children, each child
    given as {"n": label, "w": edge weight}. Leaves are labelled
    0..num_clusters-1 and internal nodes continue from num_clusters.
    """
    clusters = list(range(0, num_clusters))
    ages: Dict[int, float] = defaultdict(lambda: 0)  # the "age" of a node
    size: Dict[int, int] = defaultdict(lambda: 1)  # the number of descendants of a node
    tree: Dict[int, List[Dict[str, float]]] = {}  # the graph / tree we're building
    node_label = num_clusters  # a label for internal nodes as we add them

    while len(clusters) > 1:
        i, j = closest(distance_matrix)
        a, b = clusters[i], clusters[j]

        # The new parent sits at half the cluster distance above the leaves.
        tree[node_label] = [
            {"n": a, "w": distance_matrix[i, j] / 2 - ages[a]},
            {"n": b, "w": distance_matrix[i, j] / 2 - ages[b]},
        ]
        size[node_label] = size[a] + size[b]
        ages[node_label] = distance_matrix[i, j] / 2
        # The merged cluster takes slot i; slot j disappears.
        clusters[i] = node_label
        del clusters[j]
        distance_matrix = average_ind(distance_matrix, i, j, size[a], size[b])
        node_label += 1

    return tree
sample_input = """
4
0 20 17 11
20 0 20 13
17 20 0 10
11 13 10 0
"""

# The first line is the number of leaves; the rest is the matrix.
num_clusters, *distance_data = sample_input.strip().split("\n")
distance_matrix = np.array(parse_matrix(distance_data), float)
graph = upgma(distance_matrix, int(num_clusters))
for edge in as_edges(graph):
    print(edge)
77 Implement the Neighbor Joining Algorithm
Neighbor Joining Problem. Construct the tree resulting from applying the neighbor-joining algorithm to a distance matrix.
Given: An integer n, followed by a space-separated n x n distance matrix.
Return: An adjacency list for the tree resulting from applying the neighbor-joining algorithm. Edge-weights should be accurate to two decimal places (they are provided to three decimal places in the sample output below).
Note on formatting: The adjacency list must have consecutive integer node labels starting from 0. The n leaves must be labeled 0, 1, …, n-1 in order of their appearance in the distance matrix. Labels for internal nodes may be labeled in any order but must start from n and increase consecutively.
77.1 Sample Dataset
4
0 23 27 20
23 0 30 28
27 30 0 30
20 28 30 0
77.2 Sample Output
0->4:8.000
1->5:13.500
2->5:16.500
3->4:12.000
4->5:2.000
4->0:8.000
4->3:12.000
5->1:13.500
5->2:16.500
5->4:2.000
77.3 Solution
from collections import defaultdict
from typing import List, Dict, Union
import numpy as np
def parse_matrix(lines: List[str]) -> List[List[int]]:
    """Parse an integer matrix from whitespace-separated text lines."""
    return [[int(x) for x in line.split()] for line in lines]
def format_edges(tree: Dict[int, List[Dict[str, Union[int, float]]]]) -> List[str]:
    """Return every edge in both directions as sorted "a->b:w" strings.

    Weights are written in fixed-point notation (format spec "f").
    """
    formatted_edges = []
    for node in sorted(tree):
        for neighbor in tree[node]:
            # The tree stores each edge once; emit both directions.
            formatted_edges.append(f"{node}->{neighbor['node']}:{neighbor['weight']:f}")
            formatted_edges.append(f"{neighbor['node']}->{node}:{neighbor['weight']:f}")
    return sorted(formatted_edges)
def find_closest_pair(distance_matrix: np.ndarray) -> tuple[int, int]:
    """Find (first) minimum off-diagonal index in an array.

    Works on a copy, so the caller's matrix is not modified.
    """
    temp_matrix = np.copy(distance_matrix)
    # Lift the diagonal above every other entry so it can never be the minimum.
    np.fill_diagonal(temp_matrix, temp_matrix.max() + 1)
    # argmin gives a flat index; divmod turns it into (row, column).
    return divmod(temp_matrix.argmin(), temp_matrix.shape[1])
def calculate_neighbor_joining_matrix(distance_matrix: np.ndarray, num_nodes: int) -> np.ndarray:
    """Return the neighbor-joining matrix of a distance matrix.

    Each off-diagonal entry is
    (num_nodes - 2) * D[i, j] - TotalDistance(i) - TotalDistance(j);
    diagonal entries are copied from the input unchanged.
    """
    nj_matrix = np.copy(distance_matrix)
    size = len(distance_matrix)
    # Row totals do not depend on the pair, so compute them once.
    row_totals = [sum(distance_matrix[i, :]) for i in range(size)]
    for i in range(size):
        for j in range(size):
            if i != j:
                nj_matrix[i, j] = (num_nodes - 2) * distance_matrix[i, j] - row_totals[i] - row_totals[j]
    return nj_matrix
def neighbor_joining(distance_matrix: np.ndarray, num_nodes: int, labels: List[int] = None) -> Dict[int, List[Dict[str, Union[int, float]]]]:
    """Build a tree from a distance matrix with the neighbor-joining algorithm.

    labels holds the node label of each matrix row; when omitted, rows are
    labelled 0..num_nodes-1. The result maps a node to a list of
    {"node": neighbour, "weight": length} entries, each edge stored once.
    """
    if not labels:
        labels = list(range(num_nodes))

    if num_nodes == 2:
        # Base case: two nodes joined by a single edge.
        tree = defaultdict(list)
        tree[labels[0]].append({"node": labels[1], "weight": distance_matrix[0][1]})
        return tree

    nj_matrix = calculate_neighbor_joining_matrix(distance_matrix, num_nodes)
    i, j = find_closest_pair(nj_matrix)
    delta = (sum(distance_matrix[i, :]) - sum(distance_matrix[j, :])) / (num_nodes - 2)
    limb_i = (distance_matrix[i, j] + delta) / 2
    limb_j = (distance_matrix[i, j] - delta) / 2

    label_i = labels[i]
    label_j = labels[j]

    # Append a row and a column for the new parent node of i and j.
    distance_matrix = np.append(distance_matrix, np.zeros((1, len(distance_matrix))), axis=0)
    distance_matrix = np.append(distance_matrix, np.zeros((len(distance_matrix), 1)), axis=1)
    labels = labels + [max(labels) + 1]

    for k in range(num_nodes):
        distance_matrix[k, num_nodes] = (distance_matrix[k, i] + distance_matrix[k, j] - distance_matrix[i, j]) / 2
        distance_matrix[num_nodes, k] = (distance_matrix[k, i] + distance_matrix[k, j] - distance_matrix[i, j]) / 2
    # Delete j first, then i: this keeps index i valid provided i < j.
    for x in [j, i]:
        distance_matrix = np.delete(distance_matrix, x, 0)
        distance_matrix = np.delete(distance_matrix, x, 1)
        del labels[x]

    tree = neighbor_joining(distance_matrix, num_nodes - 1, labels)

    # Attach the two joined nodes as limbs of the new parent (last label).
    tree[labels[-1]].append({"node": label_i, "weight": limb_i})
    tree[labels[-1]].append({"node": label_j, "weight": limb_j})
    return tree
sample_input = """
4
0 23 27 20
23 0 30 28
27 30 0 30
20 28 30 0
"""

# The first line is the number of leaves; the rest is the matrix.
num_nodes, *distance_matrix = sample_input.strip().split("\n")
distance_matrix = np.array(parse_matrix(distance_matrix), float)
tree = neighbor_joining(distance_matrix, int(num_nodes))
for edge in format_edges(tree):
    print(edge)
78 Implement SmallParsimony
Small Parsimony Problem. Find the most parsimonious labeling of the internal nodes of a rooted tree.
Given: An integer n followed by an adjacency list for a rooted binary tree with n leaves labeled by DNA strings.
Return: The minimum parsimony score of this tree, followed by the adjacency list of the tree corresponding to labeling internal nodes by DNA strings in order to minimize the parsimony score of the tree.
Note: Remember to run SmallParsimony on each individual index of the strings at the leaves of the tree.
78.1 Sample Dataset
4
4->CAAATCCC
4->ATTGCGAC
5->CTGCGCTG
5->ATGGACGA
6->4
6->5
78.2 Sample Output
16
ATAGACAA->ATAGACAC:1
ATAGACAC->ATAGACAA:1
ATAGACAC->CAAATCCC:5
CAAATCCC->ATAGACAC:5
ATAGACAC->ATTGCGAC:3
ATTGCGAC->ATAGACAC:3
ATAGACAA->ATGGACAA:1
ATGGACAA->ATAGACAA:1
ATGGACAA->CTGCGCTG:5
CTGCGCTG->ATGGACAA:5
ATGGACAA->ATGGACGA:1
ATGGACGA->ATGGACAA:1
78.3 Solution
79 Adapt SmallParsimony to Unrooted Trees
Small Parsimony in an Unrooted Tree Problem. Find the most parsimonious labeling of the internal nodes in an unrooted tree.
Given: An unrooted binary tree with each leaf labeled by a string of length m.
Return: A labeling of all other nodes of the tree by strings of length m that minimizes the parsimony score of the tree.
Note on formatting: Your internal node labelings may differ from the sample provided.
79.1 Sample Dataset
4
TCGGCCAA->4
4->TCGGCCAA
CCTGGCTG->4
4->CCTGGCTG
CACAGGAT->5
5->CACAGGAT
TGAGTACC->5
5->TGAGTACC
4->5
5->4
79.2 Sample Output
17
TCGGCCAA->CCAGGCAC:4
CCTGGCTG->CCAGGCAC:3
TGAGTACC->CAAGGAAC:4
CCAGGCAC->CCTGGCTG:3
CCAGGCAC->CAAGGAAC:2
CCAGGCAC->TCGGCCAA:4
CACAGGAT->CAAGGAAC:4
CAAGGAAC->CACAGGAT:4
CAAGGAAC->TGAGTACC:4
CAAGGAAC->CCAGGCAC:2
79.3 Solution
from collections import defaultdict
from math import inf
from typing import Dict, List, Set, Tuple
# return all nodes of a simple graph
def nodes(graph: Dict[int, List[int]]) -> Set[int]:
    """Return every node that appears as a parent or as a child."""
    s = list(graph.keys())
    e = [y for v in graph.values() for y in v]
    return set(s) | set(e)
# return all leaves of a simple graph
def leaves(graph: Dict[int, List[int]]) -> Set[int]:
    """Return the child nodes that have no children of their own."""
    # .get avoids a KeyError on plain dicts and avoids inserting empty
    # entries into a defaultdict while its values are being iterated.
    return {y for v in graph.values() for y in v if not graph.get(y)}
# return all root node of a simple graph
def root(graph: Dict[int, List[int]]) -> int:
    """Return the root by climbing parent links from an arbitrary node."""
    rev = reverse_graph(graph)
    node = list(nodes(graph))[0]
    # The root is the only node without a parent entry.
    while node in rev:
        node = rev[node]
    return node
# reverse a simple graph (child points to parent)
def reverse_graph(graph: Dict[int, List[int]]) -> Dict[int, int]:
    """Return a child -> parent map for a parent -> children graph."""
    rev: Dict[int, int] = {}
    for node in graph:
        for child in graph[node]:
            rev[child] = node
    return rev
def parse_input(input_string: str) -> Tuple[Dict[int, str], Dict[int, List[int]]]:
    """Parse a rooted tree whose first n edges end in leaf sequences.

    The first line holds n. Each of the next n lines is "parent->SEQUENCE";
    the i-th such leaf gets node id i. The remaining lines are
    "parent->child" edges between integer node ids.

    Returns:
        (seqs, graph): leaf id -> sequence, and parent id -> child ids.
    """
    lines = input_string.strip().split('\n')
    n = int(lines[0])
    seqs: Dict[int, str] = {}
    graph: Dict[int, List[int]] = defaultdict(list)
    for i, edge in enumerate(lines[1:n+1]):
        f, t = edge.rstrip().split("->")
        graph[int(f)].append(i)
        seqs[i] = t
    for edge in lines[n+1:]:
        f, t = edge.rstrip().split("->")
        graph[int(f)].append(int(t))
    return seqs, graph
# print (bidirectional) edges
def print_edges(graph: Dict[int, List[int]], seqs: Dict[int, str], node: int) -> None:
    """Print every edge below node in both directions, depth-first.

    Each line has the form "SEQ_A->SEQ_B:distance", where distance is the
    number of positions at which the two sequences differ. An edge is only
    printed when both of its end nodes have a sequence.
    """
    for child in graph[node]:
        if node in seqs and child in seqs:
            dist = sum(a != b for a, b in zip(seqs[node], seqs[child]))
            print(f"{seqs[node]}->{seqs[child]}:{dist}")
            print(f"{seqs[child]}->{seqs[node]}:{dist}")
        print_edges(graph, seqs, child)
def extract_position(graph: Dict[int, List[int]], seqs: Dict[int, str], pos: int) -> Dict[int, str]:
    """Return the character at index pos for every node of the tree.

    Leaves get the character from their sequence; internal nodes get an
    empty string as a placeholder.
    """
    chars: Dict[int, str] = {}
    for n in nodes(graph) - leaves(graph):
        chars[n] = ""
    for leaf in leaves(graph):
        chars[leaf] = seqs[leaf][pos]
    return chars
def traceback(skp: Dict[int, List[Dict[int, int]]], node: int, ind: int) -> Dict[int, str]:
    """Recover the chosen base of node and of its internal descendants.

    skp[node][ind] maps each child of node to the index of the base chosen
    for that child when node is labelled with bases[ind]. Children that
    are not keys of skp (the leaves) are left out of the result.
    """
    bases = ["A", "C", "T", "G"]
    chars: Dict[int, str] = {}
    chars[node] = bases[ind]
    for k, v in skp[node][ind].items():
        if k in skp:
            chars = chars | traceback(skp, k, v)
    return chars
def small_parsimony(graph: Dict[int, List[int]], chars: Dict[int, str]) -> Tuple[int, Dict[int, str]]:
    """Solve Small Parsimony for one character position of a rooted tree.

    chars gives the character of every leaf. Returns the minimum parsimony
    score together with the base chosen for each internal node.
    """
    bases = ["A", "C", "T", "G"]
    sk: Dict[int, List[int]] = {}  # minimum parsimony score of the subtree over possible labels
    skp: Dict[int, List[Dict[int, int]]] = {}  # pointer to selected base for each child over possible labels
    to_process = nodes(graph)

    # initialise leaves
    for leaf in leaves(graph):
        sk[leaf] = [0 if chars[leaf] == c else inf for c in bases]
        to_process.remove(leaf)

    # iterate over available nodes till all are processed
    while to_process:
        for n in list(to_process):
            # A node is ready once all of its children have been scored.
            if all(v in sk for v in graph[n]):
                sk[n], skp[n] = [], []
                for k in bases:
                    tot = 0
                    ptr: Dict[int, int] = {}
                    for d, sk_child in [(d, sk[d]) for d in graph[n]]:
                        score = []
                        for i, c in enumerate(bases):
                            # Child subtree score plus 1 if the bases differ.
                            score += [sk_child[i] + (0 if c == k else 1)]
                        tot += min(score)
                        ptr[d] = score.index(min(score))
                    skp[n] += [ptr]
                    sk[n] += [tot]
                to_process.remove(n)

    # Recover sequence
    node = root(graph)
    score = min(sk[node])
    return score, traceback(skp, node, sk[node].index(score))
def ba6f(graph: Dict[int, List[int]], seqs: Dict[int, str]) -> Tuple[int, Dict[int, str]]:
    """Label the internal nodes of a rooted tree to minimise parsimony.

    Small Parsimony is solved independently for every character position.
    seqs is extended in place with the sequences of the internal nodes.

    Returns:
        (total_score, seqs): the summed score and the updated sequences.
    """
    # initialise sequences
    for n in nodes(graph) - leaves(graph):
        seqs[n] = ""

    total_score = 0
    # The sequence length is taken from the first entry of seqs.
    for pos in range(len(next(iter(seqs.values())))):
        chars = extract_position(graph, seqs, pos)
        score, tbchars = small_parsimony(graph, chars)
        total_score += score
        for k, v in tbchars.items():
            seqs[k] += v

    return total_score, seqs
sample_input = """
4
4->CAAATCCC
4->ATTGCGAC
5->CTGCGCTG
5->ATGGACGA
6->4
6->5
"""

seqs, graph = parse_input(sample_input)
total_score, seqs = ba6f(graph, seqs)
print(total_score)
print_edges(graph, seqs, root(graph))
80 Adapt SmallParsimony to Unrooted Trees
Small Parsimony in an Unrooted Tree Problem. Find the most parsimonious labeling of the internal nodes in an unrooted tree.
Given: An unrooted binary tree with each leaf labeled by a string of length m.
Return: A labeling of all other nodes of the tree by strings of length m that minimizes the parsimony score of the tree.
Note on formatting: Your internal node labelings may differ from the sample provided.
80.1 Sample Dataset
4
TCGGCCAA->4
4->TCGGCCAA
CCTGGCTG->4
4->CCTGGCTG
CACAGGAT->5
5->CACAGGAT
TGAGTACC->5
5->TGAGTACC
4->5
5->4
80.2 Sample Output
17
TCGGCCAA->CCAGGCAC:4
CCTGGCTG->CCAGGCAC:3
TGAGTACC->CAAGGAAC:4
CCAGGCAC->CCTGGCTG:3
CCAGGCAC->CAAGGAAC:2
CCAGGCAC->TCGGCCAA:4
CACAGGAT->CAAGGAAC:4
CAAGGAAC->CACAGGAT:4
CAAGGAAC->TGAGTACC:4
CAAGGAAC->CCAGGCAC:2
80.3 Solution
from collections import defaultdict
from typing import Dict, List, Set, Tuple, TextIO
import io
from math import inf
def nodes(graph: Dict[int, List[int]]) -> Set[int]:
    """Return every node id that occurs in the graph, as a key or as a child."""
    parents = set(graph)
    children = {child for kids in graph.values() for child in kids}
    return parents | children
def leaves(graph: Dict[int, List[int]]) -> Set[int]:
    """Return the nodes that appear as a child and have no children themselves.

    ``graph.get`` is used for the child lookup so that a leaf without its own
    key in a plain ``dict`` is still recognised, and so that a ``defaultdict``
    is not silently extended while it is being inspected.
    """
    return {
        child
        for children in graph.values()
        for child in children
        if not graph.get(child)
    }
def root(graph: Dict[int, List[int]]) -> int:
    """Return the root by climbing parent links from an arbitrary node."""
    parent_of = reverse_graph(graph)
    current = list(nodes(graph))[0]
    # A node without a parent entry is the root.
    while current in parent_of:
        current = parent_of[current]
    return current
def get_all_nodes(graph: Dict[int, List[int]]) -> Set[int]:
    """Return the union of all edge sources (keys) and all edge targets."""
    everything = set(graph.keys())
    for targets in graph.values():
        everything.update(targets)
    return everything
def reverse_graph(graph: Dict[int, List[int]]) -> Dict[int, int]:
    """Map every child to its parent (the last parent seen wins on duplicates)."""
    return {
        child: parent
        for parent, children in graph.items()
        for child in children
    }
def get_leaves(graph: Dict[int, List[int]]) -> Set[int]:
    """Return the nodes that appear as a child and have no children themselves.

    The child lookup goes through ``graph.get`` rather than ``graph[child]``:
    indexing raises ``KeyError`` for leaves missing from a plain ``dict`` and,
    on a ``defaultdict``, inserts keys while ``graph.values()`` is being
    iterated, which raises ``RuntimeError``.
    """
    found: Set[int] = set()
    for children in graph.values():
        for child in children:
            if not graph.get(child):
                found.add(child)
    return found
def find_root(graph: Dict[int, List[int]]) -> int:
    """Return the root by following child-to-parent links until none is left."""
    parent_of = reverse_graph(graph)
    current = next(iter(get_all_nodes(graph)))
    while current in parent_of:
        current = parent_of[current]
    return current
def print_edges(graph: Dict[int, List[int]], sequences: Dict[int, str], node: int) -> None:
    """Print every edge below ``node`` in both directions with its Hamming distance."""
    parent_label = sequences[node]
    for child in graph[node]:
        child_label = sequences[child]
        # Hamming distance between the two labels (zip stops at the shorter one).
        distance = sum(1 for a, b in zip(parent_label, child_label) if a != b)
        print(f"{parent_label}->{child_label}:{distance}")
        print(f"{child_label}->{parent_label}:{distance}")
        print_edges(graph, sequences, child)
def root_tree(graph: Dict[int, List[int]], node: int) -> None:
    """Orient an undirected adjacency list away from ``node``, in place.

    For every neighbour of ``node`` the back-reference to ``node`` is removed,
    then the same is done recursively from that neighbour.
    """
    for neighbour in graph[node]:
        back_links = graph[neighbour]
        if node in back_links:
            back_links.remove(node)
        root_tree(graph, neighbour)
def parse_input(handle: TextIO) -> Tuple[Dict[int, str], Dict[int, List[int]]]:
    """Read an unrooted tree and return its leaf labels and a rooted adjacency list.

    The first line holds the number of leaves ``n``. Each leaf then occupies
    two lines; the first is skipped and the second is parsed as
    ``node->label``. Leaves receive the ids ``0 .. n-1``. All remaining lines
    are ``node->node`` edges; the source of the first one is used as the root.

    Returns:
        ``(sequences, graph)`` where ``sequences`` maps leaf id to label and
        ``graph`` is the adjacency list oriented away from the root.
    """
    leaf_count = int(next(handle))
    sequences: Dict[int, str] = {}
    graph: Dict[int, List[int]] = defaultdict(list)

    for leaf_id in range(leaf_count):
        next(handle)  # discard the first line of the pair
        parent, label = next(handle).rstrip().split("->")
        graph[int(parent)].append(leaf_id)
        sequences[leaf_id] = label

    internal_lines = handle.readlines()
    root_node = int(internal_lines[0].rstrip().split("->")[0])
    for line in internal_lines:
        source, target = line.rstrip().split("->")
        graph[int(source)].append(int(target))

    # Drop the back-edges so that the graph points away from the chosen root.
    root_tree(graph, root_node)
    return sequences, graph
def extract_position(graph: Dict[int, List[int]], sequences: Dict[int, str], pos: int) -> Dict[int, str]:
    """Return one alignment column: leaf characters at ``pos``, "" for internal nodes."""
    leaf_ids = get_leaves(graph)
    chars: Dict[int, str] = {internal: "" for internal in get_all_nodes(graph) - leaf_ids}
    chars.update((leaf, sequences[leaf][pos]) for leaf in leaf_ids)
    return chars
def ba6f(graph: Dict[int, List[int]], sequences: Dict[int, str]) -> Tuple[int, Dict[int, str]]:
    """Label the internal nodes by running small parsimony on every column.

    Args:
        graph: Rooted adjacency list (node -> children).
        sequences: Node labels; internal-node entries are reset and rebuilt in place.

    Returns:
        The summed parsimony score and the (mutated) ``sequences`` mapping.
    """
    # Internal labels start empty and grow by one character per column.
    for internal in get_all_nodes(graph) - get_leaves(graph):
        sequences[internal] = ""

    total_score = 0
    # The number of columns is read from the first entry of ``sequences``.
    column_count = len(next(iter(sequences.values())))
    for pos in range(column_count):
        column = extract_position(graph, sequences, pos)
        column_score, assigned = small_parsimony(graph, column)
        total_score += column_score
        for node_id, base in assigned.items():
            sequences[node_id] += base
    return total_score, sequences
def small_parsimony(graph: Dict[int, List[int]], chars: Dict[int, str]) -> Tuple[int, Dict[int, str]]:
    """Solve small parsimony for a single character per leaf.

    ``sk[v][i]`` holds the minimum score of the subtree below ``v`` when ``v``
    carries ``bases[i]``; ``skp[v][i]`` records, per child, which base index
    achieved that minimum so the labelling can be recovered afterwards.

    Returns:
        The minimum score at the root and the base chosen for each internal node.
    """
    bases = ["A", "C", "T", "G"]
    sk: Dict[int, List[int]] = {}
    skp: Dict[int, List[Dict[int, int]]] = {}
    to_process = nodes(graph)

    # Leaves: zero for the observed base, infinity for every other base.
    for leaf in leaves(graph):
        sk[leaf] = [0 if chars[leaf] == base else inf for base in bases]
        to_process.remove(leaf)

    # Repeatedly handle every node whose children have all been scored.
    while to_process:
        for n in list(to_process):
            if not all(child in sk for child in graph[n]):
                continue
            node_scores: List[int] = []
            node_pointers: List[Dict[int, int]] = []
            for parent_base in bases:
                total = 0
                pointers: Dict[int, int] = {}
                for child in graph[n]:
                    # Child score plus a mismatch penalty of 1 when the bases differ.
                    options = [
                        sk[child][i] + (0 if base == parent_base else 1)
                        for i, base in enumerate(bases)
                    ]
                    best = min(options)
                    total += best
                    pointers[child] = options.index(best)
                node_pointers.append(pointers)
                node_scores.append(total)
            sk[n], skp[n] = node_scores, node_pointers
            to_process.remove(n)

    # Pick the cheapest base at the root and walk the pointers back down.
    node = root(graph)
    score = min(sk[node])
    return score, traceback(skp, node, sk[node].index(score))
def traceback(skp: Dict[int, List[Dict[int, int]]], node: int, ind: int) -> Dict[int, str]:
    """Recover the base of ``node`` and of every internal node below it.

    Args:
        skp: Per node and base index, the chosen base index of each child.
        node: Node to start from.
        ind: Index into the base list chosen for ``node``.

    Returns:
        Mapping of node id to base; children that are not keys of ``skp``
        (the leaves) are left out.
    """
    bases = ["A", "C", "T", "G"]
    chars: Dict[int, str] = {node: bases[ind]}
    for child, child_ind in skp[node][ind].items():
        if child in skp:
            chars.update(traceback(skp, child, child_ind))
    return chars
# Sample dataset: leaf count, then every edge of the unrooted tree in both directions.
sample_input = """
4
TCGGCCAA->4
4->TCGGCCAA
CCTGGCTG->4
4->CCTGGCTG
CACAGGAT->5
5->CACAGGAT
TGAGTACC->5
5->TGAGTACC
4->5
5->4
"""

handle = io.StringIO(sample_input.strip())
sequences, graph = parse_input(handle)
total_score, sequences = ba6f(graph, sequences)

# Report the parsimony score, then every labelled edge starting at the root.
print(total_score)
print_edges(graph, sequences, find_root(graph))
81 Implement FarthestFirstTraversal
Given: Integers k and m followed by a set of points Data in m-dimensional space.
Return: A set Centers consisting of k points (centers) resulting from applying FarthestFirstTraversal(Data, k), where the first point from Data is chosen as the first center to initialize the algorithm.
81.1 Sample Dataset
3 2
0.0 0.0
5.0 5.0
0.0 5.0
1.0 1.0
2.0 2.0
3.0 3.0
1.0 2.0
81.2 Sample Output
0.0 0.0
5.0 5.0
0.0 5.0
81.3 Solution
from math import sqrt
from typing import List, Tuple, Iterator, TextIO, TypeVar
import io
T = TypeVar('T')


def read_types(file_handle: TextIO, data_type: type) -> Iterator[List[T]]:
    """Yield one list per line, converting each whitespace-separated token with ``data_type``."""
    for line in file_handle:
        yield [data_type(token) for token in line.split()]
def euclidean_distance(point_a: List[float], point_b: List[float]) -> float:
    """Return the Euclidean distance between two points given as coordinate lists."""
    squared_gaps = ((a - b) ** 2 for a, b in zip(point_a, point_b))
    return sqrt(sum(squared_gaps))
def nearest_center_distance(data_point: List[float], center_points: List[List[float]]) -> float:
    """Return the Euclidean distance from ``data_point`` to its closest center."""
    distances = [euclidean_distance(data_point, center) for center in center_points]
    return min(distances)
def farthest_first_traversal(data_points: List[List[float]], num_centers: int) -> List[List[float]]:
    """Pick centers greedily, starting from the first data point.

    Each round adds the data point whose distance to its nearest chosen center
    is largest; on ties the earliest such point in ``data_points`` is taken.
    """
    chosen = [data_points[0]]
    while len(chosen) < num_centers:
        farthest = max(data_points, key=lambda point: nearest_center_distance(point, chosen))
        chosen.append(farthest)
    return chosen
import io
# Sample dataset: "k m" on the first line, then one point per line.
sample_input = """
3 2
0.0 0.0
5.0 5.0
0.0 5.0
1.0 1.0
2.0 2.0
3.0 3.0
1.0 2.0
"""

file_handle = io.StringIO(sample_input.strip())
num_centers, dimensions = next(read_types(file_handle, int))
data_points = list(read_types(file_handle, float))
center_points = farthest_first_traversal(data_points, num_centers)
for center in center_points:
    print(*center)
82 Compute the Squared Error Distortion
Squared Error Distortion Problem.
Given: Integers k and m, followed by a set of centers Centers and a set of points Data.
Return: The squared error distortion Distortion(Data, Centers).
82.1 Sample Dataset
2 2
2.31 4.55
5.96 9.08
--------
3.42 6.03
6.23 8.25
4.76 1.64
4.47 4.33
3.95 7.61
8.93 2.97
9.74 4.03
1.73 1.28
9.72 5.01
7.27 3.77
82.2 Sample Output
18.246
82.3 Solution
from math import sqrt
import io
from typing import List, Tuple, Iterator, TextIO, TypeVar
T = TypeVar('T')


def read_types(file_handle: TextIO, data_type: type) -> Iterator[List[T]]:
    """Parse each line of ``file_handle`` into a list of ``data_type`` values."""
    for line in file_handle:
        tokens = line.split()
        yield [data_type(token) for token in tokens]
def euclidean_distance(point_a: List[float], point_b: List[float]) -> float:
    """Return the straight-line distance between two points of equal dimension."""
    total = sum((a - b) ** 2 for a, b in zip(point_a, point_b))
    return sqrt(total)
def nearest_center_distance(data_point: List[float], center_points: List[List[float]]) -> float:
    """Return how far ``data_point`` is from the closest of ``center_points``."""
    return min(map(lambda center: euclidean_distance(data_point, center), center_points))
def calculate_distortion(data_points: List[List[float]], center_points: List[List[float]]) -> float:
    """Return the mean squared distance from each data point to its nearest center."""
    squared_error = sum(
        nearest_center_distance(point, center_points) ** 2 for point in data_points
    )
    return (1 / len(data_points)) * squared_error
# Sample dataset: "k m", k centers, a separator line, then the data points.
sample_input = """
2 2
2.31 4.55
5.96 9.08
--------
3.42 6.03
6.23 8.25
4.76 1.64
4.47 4.33
3.95 7.61
8.93 2.97
9.74 4.03
1.73 1.28
9.72 5.01
7.27 3.77
"""

file_handle = io.StringIO(sample_input.strip())
num_centers, _ = next(read_types(file_handle, int))
data_generator = read_types(file_handle, float)
center_points = [next(data_generator) for _ in range(num_centers)]
_ = next(file_handle)  # skip the "--------" separator
data_points = list(data_generator)
print(round(calculate_distortion(data_points, center_points), 3))
83 Implement the Lloyd Algorithm for k-Means Clustering
Implement the Lloyd algorithm.
Given: Integers k and m followed by a set of points Data in m-dimensional space.
Return: A set Centers consisting of k points (centers) resulting from applying the Lloyd algorithm to Data and Centers, where the first k points from Data are selected as the first k centers.
83.1 Sample Dataset
2 2
1.3 1.1
1.3 0.2
0.6 2.8
3.0 3.2
1.2 0.7
1.4 1.6
1.2 1.0
1.2 1.1
0.6 1.5
1.8 2.6
1.2 1.3
1.2 1.0
0.0 1.9
83.2 Sample Output
1.800 2.867
1.060 1.140
83.3 Solution
import numpy as np
import io
from math import sqrt
from typing import List, Iterator, TypeVar, Any
T = TypeVar('T')


def read_types(file_handle: io.StringIO, data_type: type) -> Iterator[List[T]]:
    """Yield the tokens of each line converted with ``data_type``."""
    for line in file_handle:
        yield [data_type(field) for field in line.split()]
def euclidean_distance(point_a: np.ndarray, point_b: np.ndarray) -> float:
    """Return the Euclidean distance between two points, coordinate by coordinate."""
    squared_gaps = ((x - y) ** 2 for x, y in zip(point_a, point_b))
    return sqrt(sum(squared_gaps))
def nearest_center_assignment(data_point: np.ndarray, center_points: List[np.ndarray]) -> int:
    """Return the index of the center closest to ``data_point`` (first one on ties)."""
    distances = [euclidean_distance(data_point, center) for center in center_points]
    smallest = min(distances)
    return distances.index(smallest)
def compute_center(data_points: List[np.ndarray], assignments: List[int], center_index: int) -> np.ndarray:
    """Return the mean of the points assigned to ``center_index``.

    A cluster without any assigned point yields a zero vector shaped like the
    first data point.
    """
    members = [point for point, label in zip(data_points, assignments) if label == center_index]
    if not members:
        return np.zeros_like(data_points[0])
    return np.mean(np.array(members), axis=0)
def k_means(data_points: List[np.ndarray], num_clusters: int, max_iterations: int = 20) -> List[np.ndarray]:
    """Run Lloyd's algorithm for a fixed number of iterations.

    The first ``num_clusters`` data points are the initial centers. Every
    iteration assigns each point to its nearest center and then recomputes
    each center from its assigned points.
    """
    centers = data_points[:num_clusters]
    completed = 0
    while completed < max_iterations:
        labels = [nearest_center_assignment(point, centers) for point in data_points]
        centers = [compute_center(data_points, labels, index) for index in range(num_clusters)]
        completed += 1
    return centers
# Sample dataset: "k m" on the first line, then one point per line.
sample_input = """
2 2
1.3 1.1
1.3 0.2
0.6 2.8
3.0 3.2
1.2 0.7
1.4 1.6
1.2 1.0
1.2 1.1
0.6 1.5
1.8 2.6
1.2 1.3
1.2 1.0
0.0 1.9
"""

file_handle = io.StringIO(sample_input.strip())
num_clusters, dimensions = next(read_types(file_handle, int))
data_points = [np.array(point) for point in read_types(file_handle, float)]
for center in k_means(data_points, num_clusters):
    formatted = [f"{coord:f}" for coord in center]
    print(*formatted)
84 Implement the Soft k-Means Clustering Algorithm
Implement the Soft k-Means Clustering Algorithm.
Given: Integers k and m, followed by a stiffness parameter β, followed by a set of points Data in m-dimensional space.
Return: A set Centers consisting of k points (centers) resulting from applying the soft k-means clustering algorithm. Select the first k points from Data as the first centers for the algorithm and run the algorithm for 100 steps. Results should be accurate up to three decimal places.
84.1 Sample Dataset
2 2
2.7
1.3 1.1
1.3 0.2
0.6 2.8
3.0 3.2
1.2 0.7
1.4 1.6
1.2 1.0
1.2 1.1
0.6 1.5
1.8 2.6
1.2 1.3
1.2 1.0
0.0 1.9
84.2 Sample Output
1.662 2.623
1.075 1.148
84.3 Solution
import numpy as np
from math import sqrt
import io
from typing import List, Iterator, TypeVar, Any
T = TypeVar('T')


def read_types(file_handle: io.StringIO, data_type: type) -> Iterator[List[T]]:
    """Yield, for every line, its whitespace-separated fields cast to ``data_type``."""
    for line in file_handle:
        fields = line.split()
        yield [data_type(field) for field in fields]
def euclidean_distance(point_a: np.ndarray, point_b: np.ndarray) -> float:
    """Return the Euclidean distance between two points of equal dimension."""
    total = sum((x - y) ** 2 for x, y in zip(point_a, point_b))
    return sqrt(total)
def partition_function(data_point: np.ndarray, center_points: np.ndarray, beta: float) -> np.ndarray:
    """Return the normalised weights ``exp(-beta * distance)`` of one point over all centers."""
    weights = [np.exp(-beta * euclidean_distance(data_point, center)) for center in center_points]
    normaliser = sum(weights)
    return np.array(weights) / normaliser
def hidden_matrix(data_points: np.ndarray, center_points: np.ndarray, beta: float) -> np.ndarray:
    """Return the matrix whose row ``r`` holds the center weights of data point ``r``."""
    rows = [partition_function(point, center_points, beta) for point in data_points]
    return np.array(rows)
def soft_k_means(data_points: List[np.ndarray], num_clusters: int, beta: float, max_iterations: int = 20) -> np.ndarray:
    """Run soft k-means for ``max_iterations`` steps.

    The first ``num_clusters`` points are the initial centers. Each step
    computes the weight matrix for the current centers and then replaces
    every center by the weight-averaged data points.
    """
    centers = np.array(data_points[:num_clusters])
    points = np.array(data_points)
    for _ in range(max_iterations):
        weights = hidden_matrix(points, centers, beta)
        weighted_sums = np.array([np.dot(weights[:, k], points) for k in range(num_clusters)])
        totals = np.sum(weights, 0)
        # Divide row k of the weighted sums by the total weight of cluster k.
        centers = weighted_sums / totals[:, np.newaxis]
    return centers
# Sample dataset: "k m", the stiffness parameter beta, then one point per line.
sample_input = """
2 2
2.7
1.3 1.1
1.3 0.2
0.6 2.8
3.0 3.2
1.2 0.7
1.4 1.6
1.2 1.0
1.2 1.1
0.6 1.5
1.8 2.6
1.2 1.3
1.2 1.0
0.0 1.9
"""

file_handle = io.StringIO(sample_input.strip())
num_clusters, dimensions = next(read_types(file_handle, int))
beta = next(read_types(file_handle, float))[0]
data_points = [np.array(point) for point in read_types(file_handle, float)]
# The problem statement asks for 100 steps; the function's default is only 20.
for center in soft_k_means(data_points, num_clusters, beta, max_iterations=100):
    print(*[f"{coord:f}" for coord in center])
85 Implement Hierarchical Clustering
Implement Hierarchical Clustering.
Given: An integer n, followed by an n × n distance matrix.
Return: The result of applying HierarchicalClustering to this distance matrix (using \(D_{avg}\)), with each newly created cluster listed on each line.
85.1 Sample Dataset
7
0.00 0.74 0.85 0.54 0.83 0.92 0.89
0.74 0.00 1.59 1.35 1.20 1.48 1.55
0.85 1.59 0.00 0.63 1.13 0.69 0.73
0.54 1.35 0.63 0.00 0.66 0.43 0.88
0.83 1.20 1.13 0.66 0.00 0.72 0.55
0.92 1.48 0.69 0.43 0.72 0.00 0.80
0.89 1.55 0.73 0.88 0.55 0.80 0.00
85.2 Sample Output
4 6
5 7
3 4 6
1 2
5 7 3 4 6
1 2 5 7 3 4 6
85.3 Solution
import numpy as np
from collections import defaultdict
import io
from typing import List, Tuple, Dict, Generator
def find_closest_pair(distance_matrix: np.ndarray) -> Tuple[int, int]:
    """Return the (row, column) of the first minimum off-diagonal entry.

    The diagonal of a copy is overwritten with a value larger than every
    entry so that it can never be selected; the input is left untouched.
    """
    masked = np.copy(distance_matrix)
    ceiling = masked.max() + 1
    np.fill_diagonal(masked, ceiling)
    flat_index = masked.argmin()
    return divmod(flat_index, masked.shape[1])
def average_distances(distance_matrix: np.ndarray, i: int, j: int, size_i: int, size_j: int) -> np.ndarray:
    """Merge clusters ``i`` and ``j`` and return the reduced distance matrix.

    Row and column ``i`` of a copy receive the size-weighted average of rows
    ``i`` and ``j``; row and column ``j`` are then removed and the diagonal is
    reset to zero. The input matrix is not modified.
    """
    merged = np.copy(distance_matrix)
    weighted_row = (merged[i, :] * size_i + merged[j, :] * size_j) / (size_i + size_j)
    merged[i, :] = weighted_row
    merged[:, i] = weighted_row
    merged = np.delete(np.delete(merged, j, 0), j, 1)
    np.fill_diagonal(merged, 0)
    return merged
def get_descendants(tree: Dict[int, List[int]], node: int) -> List[int]:
    """Return, in breadth-first order, the nodes below ``node`` that are not keys of ``tree``.

    A ``node`` that is itself not a key of ``tree`` is returned as a
    single-element list.
    """
    pending = [node]
    found: List[int] = []
    cursor = 0
    # ``pending`` is a FIFO queue; ``cursor`` marks the next entry to visit.
    while cursor < len(pending):
        current = pending[cursor]
        cursor += 1
        if current in tree:
            pending.extend(tree[current])
        else:
            found.append(current)
    return found
def hierarchical_clustering(distance_matrix: np.ndarray, num_elements: int) -> Generator[List[int], None, None]:
    """Yield the members of every newly formed cluster, one merge at a time.

    Elements are labelled ``1 .. num_elements``; each merge creates a new node
    labelled with the next integer. The closest pair of clusters is merged and
    the distance matrix is updated with size-weighted averages.

    Args:
        distance_matrix: Square matrix of pairwise distances.
        num_elements: Number of elements (rows of ``distance_matrix``).

    Yields:
        The original elements contained in the cluster created by each merge.
    """
    clusters = list(range(1, num_elements + 1))
    tree: Dict[int, List[int]] = {}
    cluster_size = defaultdict(lambda: 1)  # the number of descendants of a node
    next_node = num_elements

    while len(clusters) > 1:
        next_node += 1
        # Find the closest pair once and reuse it for the matrix update.
        i, j = find_closest_pair(distance_matrix)
        cluster_a, cluster_b = clusters[i], clusters[j]
        tree[next_node] = [cluster_a, cluster_b]
        cluster_size[next_node] = cluster_size[cluster_a] + cluster_size[cluster_b]
        distance_matrix = average_distances(
            distance_matrix, i, j, cluster_size[cluster_a], cluster_size[cluster_b]
        )
        # The merged cluster takes slot i; slot j disappears, matching the matrix.
        clusters[i] = next_node
        del clusters[j]
        yield get_descendants(tree, cluster_a) + get_descendants(tree, cluster_b)
# Sample dataset: n on the first line, then the n x n distance matrix.
sample_input = """
7
0.00 0.74 0.85 0.54 0.83 0.92 0.89
0.74 0.00 1.59 1.35 1.20 1.48 1.55
0.85 1.59 0.00 0.63 1.13 0.69 0.73
0.54 1.35 0.63 0.00 0.66 0.43 0.88
0.83 1.20 1.13 0.66 0.00 0.72 0.55
0.92 1.48 0.69 0.43 0.72 0.00 0.80
0.89 1.55 0.73 0.88 0.55 0.80 0.00
"""

num_elements, *distance_data = io.StringIO(sample_input.strip()).read().splitlines()
matrix_rows = [list(map(float, row.split())) for row in distance_data]
distance_matrix = np.array(matrix_rows)
for step in hierarchical_clustering(distance_matrix, int(num_elements)):
    print(*step)
86 Construct a Trie from a Collection of Patterns
Trie Construction Problem. Construct a trie on a collection of patterns.
Given: A collection of strings Patterns.
Return: The adjacency list corresponding to Trie(Patterns), in the following format. If Trie(Patterns) has n nodes, first label the root with 1 and then label the remaining nodes with the integers 2 through n in any order you like. Each edge of the adjacency list of Trie(Patterns) will be encoded by a triple: the first two members of the triple must be the integers labeling the initial and terminal nodes of the edge, respectively; the third member of the triple must be the symbol labeling the edge. (Note: the sample output and the solution below label the root with 0 rather than 1.)
86.1 Sample Dataset
ATAGA
ATC
GAT
86.2 Sample Output
0->1:A
1->2:T
2->3:A
3->4:G
4->5:A
2->6:C
0->7:G
7->8:A
8->9:T
86.3 Solution
from itertools import count
from typing import List, Dict, Tuple, Any
class Trie:
    """Trie whose nodes are ``(node_id, children)`` tuples; the root has id 0."""

    def __init__(self) -> None:
        # Node ids are handed out in creation order, starting at 0 for the root.
        self.id_generator = count(start=0)
        self.root: Tuple[int, Dict[str, Any]] = (next(self.id_generator), {})

    def insert(self, sequence: str) -> None:
        """Add ``sequence`` to the trie, creating nodes only for unseen symbols."""
        children = self.root[1]
        for character in sequence:
            if character not in children:
                children[character] = (next(self.id_generator), {})
            children = children[character][1]
def build_trie(sequences: List[str]) -> Tuple[int, Dict[str, Any]]:
    """Insert every sequence into a fresh trie and return its root node."""
    builder = Trie()
    for pattern in sequences:
        builder.insert(pattern)
    return builder.root
def format_trie(node: Tuple[int, Dict[str, Any]]) -> List[str]:
    """Return the ``parent->child:symbol`` lines of the subtree below ``node``, depth first."""
    node_id, children = node
    lines: List[str] = []
    for char, child_node in children.items():
        child_id, _ = child_node
        lines.append(f"{node_id}->{child_id}:{char}")
        lines += format_trie(child_node)
    return lines
# Sample dataset: one pattern per line.
sample_input: str = """
ATAGA
ATC
GAT
"""

sequences: List[str] = sample_input.strip().split("\n")

trie_root = build_trie(sequences)
for edge in format_trie(trie_root):
    print(edge)
87 Implement TrieMatching
Given: A string Text and a collection of strings Patterns.
Return: All starting positions in Text where a string from Patterns appears as a substring.
87.1 Sample Dataset
AATCGGGTTCAATCGGGGT
ATCG
GGGT
87.2 Sample Output
1 4 11 15
87.3 Solution
from collections import defaultdict
from typing import List, DefaultDict, Set
# Sample dataset: the text on the first line, then one pattern per line.
sample_input: str = """
AATCGGGTTCAATCGGGGT
ATCG
GGGT
"""

sequences: List[str] = sample_input.strip().split("\n")
main_sequence: str = sequences[0]
kmers: List[str] = sequences[1:]

# Start positions of every substring of the text, for each pattern length in
# use. Indexing one length per distinct pattern length (rather than only the
# length of the first pattern) keeps patterns of differing lengths from being
# missed and copes with an empty pattern list.
kmer_positions: DefaultDict[str, Set[int]] = defaultdict(set)

for kmer_length in sorted({len(kmer) for kmer in kmers}):
    for start_index in range(len(main_sequence) - kmer_length + 1):
        current_kmer: str = main_sequence[start_index:start_index + kmer_length]
        kmer_positions[current_kmer].add(start_index)

all_positions: Set[int] = set()
for kmer in kmers:
    all_positions.update(kmer_positions[kmer])

sorted_positions: List[int] = sorted(all_positions)
output: str = " ".join(map(str, sorted_positions))

print(output)
88 Construct the Suffix Tree of a String
Suffix Tree Construction Problem. Construct the suffix tree of a string.
Given: A string Text.
Return: The strings labeling the edges of SuffixTree(Text). (You may return these strings in any order.)
88.1 Sample Dataset
ATAAATG$
88.2 Sample Output
AAATG$
G$
T
ATG$
TG$
A
A
AAATG$
G$
T
G$
$
88.3 Solution
from typing import List, Optional
class Tree:
    """Suffix tree built by compressing the non-branching paths of a suffix trie.

    Edges do not store their label; they store the ``position`` and ``length``
    of a substring of the text, which ``edge_labels`` turns back into strings.
    """

    class Node:
        """Tree node; ``depth`` is the total edge length from the root."""

        def __init__(self) -> None:
            self.label: Optional[int] = None
            self.edges: List['Tree.Edge'] = []
            self.indicator: Optional[str] = None
            self.depth: int = 0

    class Edge:
        """Directed edge labelled by ``text[position:position + length]``."""

        def __init__(self) -> None:
            self.from_node: Optional['Tree.Node'] = None
            self.target_node: Optional['Tree.Node'] = None
            self.position: Optional[int] = None
            self.length: Optional[int] = None

    def __init__(self) -> None:
        self.all_nodes: List[Tree.Node] = []
        self.all_edges: List[Tree.Edge] = []
        self.root: Tree.Node = self.add_node()

    def add_node(self) -> Node:
        """Create a node labelled with its creation index and register it."""
        new_node = Tree.Node()
        new_node.label = len(self.all_nodes)
        self.all_nodes.append(new_node)
        return new_node

    def add_edge(self, from_node: Node, target_node: Node, pos: int, length: int) -> Edge:
        """Connect two nodes and set the depth of ``target_node`` accordingly."""
        new_edge = Tree.Edge()
        new_edge.from_node = from_node
        new_edge.target_node = target_node
        new_edge.position = pos
        new_edge.length = length
        target_node.depth = from_node.depth + length

        from_node.edges.append(new_edge)
        self.all_edges.append(new_edge)
        return new_edge

    def edge_labels(self, text: str) -> List[str]:
        """Return the substring of ``text`` labelling each edge, in creation order."""
        return [text[edge.position : edge.position + edge.length] for edge in self.all_edges]

    def return_ripe_nodes(self) -> List[Node]:
        """Return the nodes without an indicator whose children all have one."""
        ripe_nodes = []
        for node in self.all_nodes:
            if node.indicator is None and all(edge.target_node.indicator is not None for edge in node.edges):
                ripe_nodes.append(node)
        return ripe_nodes

    def add_indicators(self) -> None:
        """Propagate indicators upwards until every node has one.

        A node inherits the indicator shared by all of its children, or gets
        ``'*'`` when the children do not agree on exactly one indicator.
        """
        ripe_nodes = self.return_ripe_nodes()
        while ripe_nodes:
            for node in ripe_nodes:
                children_indicators = set(edge.target_node.indicator for edge in node.edges)
                node.indicator = '*' if len(children_indicators) != 1 else next(iter(children_indicators))
            ripe_nodes = self.return_ripe_nodes()

    def construct_suffix_tree(
        self,
        trie_node: 'Trie.Node',
        tree_node: Node,
        path: Optional[List['Trie.Edge']] = None,
    ) -> None:
        """Copy the trie below ``trie_node`` into the tree below ``tree_node``.

        Args:
            trie_node: Trie node to continue from.
            tree_node: Tree node that the compressed edge is attached to.
            path: Trie edges already collected for the edge being compressed.
                A fresh list is used when omitted; a list that is passed in
                is extended in place.
        """
        # A default of ``[]`` would be shared between calls and keep growing.
        if path is None:
            path = []

        # Follow the non-branching chain; it collapses into a single tree edge.
        while len(trie_node.edges) == 1:
            trie_edge = trie_node.edges[0]
            path.append(trie_edge)
            trie_node = trie_edge.target_node

        if path:
            new_tree_node = self.add_node()
            self.add_edge(tree_node, new_tree_node, path[0].position, len(path))
            tree_node = new_tree_node

        # A trie leaf passes its indicator and label on to the tree node.
        if not trie_node.edges:
            tree_node.indicator = trie_node.indicator
            tree_node.label = trie_node.label
            return

        for trie_edge in trie_node.edges:
            self.construct_suffix_tree(trie_edge.target_node, tree_node, [trie_edge])

    def populate_suffix_tree(self, text: str) -> None:
        """Build the suffix trie of ``text`` and compress it into this tree."""
        suffix_trie = Trie()
        suffix_trie.construct_suffix_trie(text)
        self.construct_suffix_tree(suffix_trie.root, self.root)
class Trie:
    """Suffix trie with one symbol per edge."""

    class Node:
        """Trie node; leaves later receive a label and an indicator."""

        def __init__(self) -> None:
            self.label: Optional[int] = None
            self.edges: List['Trie.Edge'] = []
            self.indicator: Optional[str] = None

    class Edge:
        """Edge carrying one symbol and the text position it was read from."""

        def __init__(self) -> None:
            self.from_node: Optional['Trie.Node'] = None
            self.target_node: Optional['Trie.Node'] = None
            self.label: Optional[str] = None
            self.position: Optional[int] = None

    def __init__(self) -> None:
        self.all_nodes: List[Trie.Node] = []
        self.all_edges: List[Trie.Edge] = []
        self.root: Trie.Node = self.add_node()

    def add_node(self) -> Node:
        """Create a node labelled with its creation index and register it."""
        node = Trie.Node()
        node.label = len(self.all_nodes)
        self.all_nodes.append(node)
        return node

    def add_edge(self, from_node: Node, target_node: Node, label: str, pos: Optional[int] = None) -> Edge:
        """Connect two nodes with an edge labelled ``label`` read at ``pos``."""
        edge = Trie.Edge()
        edge.from_node = from_node
        edge.target_node = target_node
        edge.label = label
        edge.position = pos

        from_node.edges.append(edge)
        self.all_edges.append(edge)
        return edge

    def construct_suffix_trie(self, text: str) -> None:
        """Insert every suffix of ``text``, longest first.

        The node reached at the end of suffix ``i`` is labelled ``L{i}`` and
        tagged with the current indicator if it has no outgoing edges. The
        indicator is ``'#'`` up to and including the suffix that starts with
        ``'#'`` and ``'$'`` for all later suffixes.
        """
        indicator = '#'
        for i in range(len(text)):
            current_node = self.root
            for j in range(i, len(text)):
                current_symbol = text[j]
                # Reuse the existing edge for this symbol, or branch off.
                for edge in current_node.edges:
                    if edge.label == current_symbol:
                        current_node = edge.target_node
                        break
                else:
                    new_node = self.add_node()
                    self.add_edge(current_node, new_node, current_symbol, j)
                    current_node = new_node
            if not current_node.edges:
                current_node.label = f'L{i}'
                current_node.indicator = indicator
            if text[i] == '#':
                indicator = '$'
# Sample dataset: a single text terminated by "$".
sample_input = "ATAAATG$"
input_lines = sample_input.strip().split()
text = input_lines[0]

tree = Tree()
tree.populate_suffix_tree(text)

# One edge label per output line.
result = tree.edge_labels(text)
print("\n".join(result))
89 Find the Longest Repeat in a String
Longest Repeat Problem. Find the longest repeat in a string.
Given: A string Text.
Return: A longest substring of Text that appears in Text more than once. (Multiple solutions may exist, in which case you may return any one.)
89.1 Sample Dataset
ATATCGTTTTATCGTT
89.2 Sample Output
TATCGTT
89.3 Solution
from functools import cache
from os.path import commonprefix
from typing import Dict, List, Tuple, Generator, Iterator
def get_edges(graph: Dict[str, Dict]) -> Generator[str, None, None]:
    """Yield every edge label of a nested-dict tree in depth-first, pre-order."""
    for label, subtree in graph.items():
        yield label
        yield from get_edges(subtree)
@cache
def suffix_tree(sequence: str, start_positions: Tuple[int, ...]) -> Dict[str, Dict]:
    """Build the subtree for the suffixes of ``sequence`` starting at ``start_positions``.

    Suffixes are grouped by their first symbol; each group contributes one
    edge labelled with the group's longest common prefix, and the subtree
    below it is built from the positions advanced past that prefix.
    Positions that reach the end of ``sequence`` are dropped.

    Returns:
        Nested dictionaries mapping edge labels to subtrees.
    """
    tree: Dict[str, Dict] = {}
    for base in sorted({sequence[start] for start in start_positions}):
        group = [start for start in start_positions if sequence[start] == base]
        edge_label = commonprefix([sequence[start:] for start in group])
        step = len(edge_label)
        continuing = tuple(start + step for start in group if start + step < len(sequence))
        tree[edge_label] = suffix_tree(sequence, continuing)
    return tree
def create_suffix_tree(sequence: str) -> Dict[str, Dict]:
    """Build the nested-dict suffix tree over all suffixes of ``sequence``."""
    every_start = tuple(range(len(sequence)))
    return suffix_tree(sequence, every_start)
def internal_edges(tree: Dict[str, Dict]) -> Iterator[str]:
    """Yield path strings relative to ``tree``, one per leaf edge.

    A leaf edge (empty subtree) yields ``""``, i.e. the path that ends at its
    parent; every string yielded for a subtree is prefixed with the label of
    the edge leading into it.
    """
    for label, subtree in tree.items():
        if len(subtree) == 0:
            yield ""
        for tail in internal_edges(subtree):
            yield label + tail
def longest_shared_substring(tree: Dict[str, Dict]) -> str:
    """Return the longest path string produced by ``internal_edges`` for ``tree``.

    An empty tree (built from an empty sequence) yields ``""`` instead of
    raising ``ValueError`` from ``max`` on an empty iterator.
    """
    return max(internal_edges(tree), key=len, default="")
# Sample dataset: a single text.
sample_input = """
ATATCGTTTTATCGTT
"""

sequence = sample_input.strip()
# Named ``tree`` so that the ``suffix_tree`` function is not rebound to its result.
tree = create_suffix_tree(sequence)
print(longest_shared_substring(tree))
92 Construct the Suffix Array of a String
Suffix Array Construction Problem. Construct the suffix array of a string.
Given: A string Text.
Return: SuffixArray(Text).
92.1 Sample Dataset
AACGATAGCGGTAGA$
92.2 Sample Output
15, 14, 0, 1, 12, 6, 4, 2, 8, 13, 3, 7, 9, 10, 11, 5
92.3 Solution
from typing import List
def create_suffix_array(text: str) -> List[int]:
    """
    Create a suffix array for the given text.

    Args:
        text (str): The input string to create a suffix array for.

    Returns:
        List[int]: The start positions of all suffixes, ordered by suffix.
    """
    # Sort the start positions directly, comparing the suffixes they denote.
    return sorted(range(len(text)), key=lambda start: text[start:])
# Sample dataset: a single text terminated by "$".
sample_input = "AACGATAGCGGTAGA$"
suffix_array = create_suffix_array(sample_input.strip())
# Positions are printed on one line, separated by ", ".
print(*suffix_array, sep=", ")
93 Pattern Matching with the Suffix Array
Multiple Pattern Matching with the Suffix Array.
Given: A string Text and a collection of strings Patterns.
Return: All starting positions in Text where a string from Patterns appears as a substring.
93.1 Sample Dataset
AATCGGGTTCAATCGGGGT
ATCG
GGGT
93.2 Sample Output
1 4 11 15
93.3 Solution
from typing import List, Tuple
def create_suffix_array(text: str) -> List[int]:
    """
    Create a suffix array for the given text.

    Args:
        text (str): The input string to create a suffix array for.

    Returns:
        List[int]: The start positions of all suffixes, ordered by suffix.
    """
    positions: List[int] = list(range(len(text)))
    positions.sort(key=lambda start: text[start:])
    return positions
def find_pattern_occurrences(text: str, pattern: str, suffix_array: List[int]) -> List[int]:
    """
    Find the suffix-array ranks of all suffixes that start with the pattern.

    Two binary searches locate the first rank whose suffix prefix is not
    smaller than the pattern and the first rank whose prefix is greater.

    Args:
        text (str): The input text to search in.
        pattern (str): The pattern to search for.
        suffix_array (List[int]): The suffix array of the text.

    Returns:
        List[int]: Consecutive indices into ``suffix_array`` (not text
        positions); ``suffix_array[index]`` is the start of a match. Empty
        when the pattern does not occur.
    """
    width = len(pattern)

    def prefix_at(rank: int) -> str:
        # Slice only the compared window instead of copying the whole suffix.
        start = suffix_array[rank]
        return text[start:start + width]

    # Lower bound: first rank whose prefix is >= pattern.
    lower_bound = 0
    upper_bound = len(suffix_array)
    while lower_bound < upper_bound:
        mid_point = (lower_bound + upper_bound) // 2
        if pattern > prefix_at(mid_point):
            lower_bound = mid_point + 1
        else:
            upper_bound = mid_point
    first_occurrence = lower_bound

    # Upper bound: first rank at or after the lower bound whose prefix is > pattern.
    upper_bound = len(suffix_array)
    while lower_bound < upper_bound:
        mid_point = (lower_bound + upper_bound) // 2
        if pattern < prefix_at(mid_point):
            upper_bound = mid_point
        else:
            lower_bound = mid_point + 1
    last_occurrence = upper_bound

    return list(range(first_occurrence, last_occurrence))
# Sample dataset: the text on the first line, then one pattern per line.
sample_input: str = """
AATCGGGTTCAATCGGGGT
ATCG
GGGT
"""

sequence, *patterns = sample_input.strip().split("\n")
suffix_array: List[int] = create_suffix_array(sequence)
matching_indices: List[int] = []

for pattern in patterns:
    # Translate suffix-array ranks back into text positions.
    ranks = find_pattern_occurrences(sequence, pattern, suffix_array)
    matching_indices.extend(suffix_array[index] for index in ranks)

print(*sorted(set(matching_indices)))
94 Construct the Burrows-Wheeler Transform of a String
Burrows-Wheeler Transform Construction Problem. Construct the Burrows-Wheeler transform of a string.
Given: A string Text.
Return: BWT(Text).
94.1 Sample Dataset
GCGTGCCTGGTCA$
94.2 Sample Output
ACTGGCT$TGCGGC
94.3 Solution
from typing import List
def create_suffix_array(text: str) -> List[int]:
    """
    Create a suffix array for the given text.

    Args:
        text (str): The input string to create a suffix array for.

    Returns:
        List[int]: The start positions of all suffixes, ordered by suffix.
    """
    # Pair every suffix with its start, sort the pairs, keep the starts.
    ordered = sorted((text[start:], start) for start in range(len(text)))
    return [start for _, start in ordered]
def burrows_wheeler_transform(sequence: str) -> str:
    """
    Perform the Burrows-Wheeler Transform on the input sequence.

    Args:
        sequence (str): The input string to transform.

    Returns:
        str: The Burrows-Wheeler Transform of the input sequence.
    """
    # The character preceding each sorted suffix; index -1 wraps to the last
    # character for the suffix that starts at position 0.
    preceding = [sequence[start - 1] for start in create_suffix_array(sequence)]
    return ''.join(preceding)
# Sample dataset: a single text terminated by "$".
sample_input = "GCGTGCCTGGTCA$"
transformed_sequence = burrows_wheeler_transform(sample_input.strip())
print(transformed_sequence)
95 Reconstruct a String from its Burrows-Wheeler Transform
Inverse Burrows-Wheeler Transform Problem. Reconstruct a string from its Burrows-Wheeler transform.
Given: A string Transform (with a single “$” sign).
Return: The string Text such that BWT(Text) = Transform.
95.1 Sample Dataset
TTCCTAACG$A
95.2 Sample Output
TACATCACGT$
95.3 Solution
from collections import defaultdict
from typing import List, Tuple, Generator
def index_characters(sequence: str) -> Generator[Tuple[str, int], None, None]:
    """
    Generate each character with its occurrence number in the sequence.

    Args:
        sequence (str): The input string to index.

    Yields:
        Tuple[str, int]: The character and how many times it occurred before.
    """
    seen = defaultdict(int)
    for symbol in sequence:
        rank = seen[symbol]
        seen[symbol] = rank + 1
        yield symbol, rank
def burrows_wheeler_transform_inverse(bwt_sequence: str) -> str:
    """
    Perform the inverse Burrows-Wheeler Transform on the input sequence.

    Starting from the first ``"$"`` in the last column, each step looks up the
    row of the current (character, occurrence) pair in the last column and
    moves to the pair in the same row of the first column.

    Args:
        bwt_sequence (str): The Burrows-Wheeler transformed string.

    Returns:
        str: The original string before BWT.

    Raises:
        ValueError: If a non-empty ``bwt_sequence`` contains no ``"$"``.
    """
    first_column = list(index_characters(sorted(bwt_sequence)))
    last_column = list(index_characters(bwt_sequence))

    # Every (character, occurrence) pair is unique, so the row of a pair can
    # be looked up in a dict instead of scanning the column on every step.
    row_of = {entry: row for row, entry in enumerate(last_column)}

    current_char: Tuple[str, int] = ("$", 0)
    if last_column and current_char not in row_of:
        raise ValueError(f"{current_char!r} is not in list")

    original_sequence: List[str] = []
    for _ in range(len(bwt_sequence)):
        current_char = first_column[row_of[current_char]]
        original_sequence.append(current_char[0])

    return ''.join(original_sequence)
# Sample dataset: a Burrows-Wheeler transform containing a single "$".
sample_input = "TTCCTAACG$A"
original_sequence = burrows_wheeler_transform_inverse(sample_input.strip())
print(original_sequence)
96 Generate the Last-to-First Mapping of a String
Last-to-First Mapping Problem.
Given: A string Transform and an integer i.
Return: The position LastToFirst(i) in FirstColumn in the Burrows-Wheeler matrix if LastColumn = Transform.
96.1 Sample Dataset
T$GACCA
3
96.2 Sample Output
1
96.3 Solution
from collections import defaultdict
from typing import List, Tuple, Generator
def index_characters(sequence: str) -> Generator[Tuple[str, int], None, None]:
    """
    Generate each character with its occurrence number in the sequence.

    Args:
        sequence (str): The input string to index.

    Yields:
        Tuple[str, int]: The character and how many times it occurred before.
    """
    occurrences = defaultdict(int)
    for symbol in sequence:
        pair = (symbol, occurrences[symbol])
        occurrences[symbol] += 1
        yield pair
def last_to_first_mapping(sequence: str, index: int) -> int:
    """
    Find the mapping from Last column to First column in the Burrows-Wheeler Transform matrix.

    Both columns are tagged with occurrence numbers, so the k-th copy of a
    symbol in the last column is located as the k-th copy of that symbol in
    the sorted first column.

    Args:
        sequence (str): The input string (Last column of BWT matrix).
        index (int): The index in the Last column.

    Returns:
        int: The corresponding index in the First column.
    """
    first_column = list(index_characters(sorted(sequence)))
    last_column = list(index_characters(sequence))
    return first_column.index(last_column[index])
sample_input = """
T$GACCA
3
"""
sequence, index_str = sample_input.strip().split("\n")
index = int(index_str)

result = last_to_first_mapping(sequence.strip(), index)
print(result)
97 Implement BWMatching
Implement BWMatching.
Given: A string BWT(Text), followed by a collection of strings Patterns.
Return: A list of integers, where the i-th integer corresponds to the number of substring matches of the i-th member of Patterns in Text.
97.1 Sample Dataset
TCCTCTATGAGATCCTATTCTATGAAACCTTCA$GACCAAAATTCTCCGGC
CCT CAC GAG CAG ATC
97.2 Sample Output
2 1 1 0 1
97.3 Solution
from typing import List, Dict, Tuple
def BWMatching(bwt: str, patterns: List[str]) -> List[int]:
    """
    Count the occurrences of each pattern in the text whose BWT is given.

    Args:
        bwt: The Burrows-Wheeler transform of the text (last column).
        patterns: The patterns to look up.

    Returns:
        One count per pattern, in the order of ``patterns``. A pattern
        containing a symbol that never occurs in ``bwt`` counts 0; an empty
        pattern matches every row, i.e. ``len(bwt)``.
    """

    def create_count_and_first_occurrence(bwt: str) -> Tuple[Dict[str, List[int]], Dict[str, int]]:
        # count[c][i] = number of occurrences of c in bwt[:i]
        count: Dict[str, List[int]] = {char: [0] * (len(bwt) + 1) for char in set(bwt)}
        for i, char in enumerate(bwt):
            for c, column in count.items():
                column[i + 1] = column[i] + (c == char)

        # first_occurrence[c] = first row of the sorted column that holds c
        first_occurrence: Dict[str, int] = {}
        for i, char in enumerate(sorted(bwt)):
            first_occurrence.setdefault(char, i)

        return count, first_occurrence

    def count_matches(pattern: str) -> int:
        top: int = 0
        bottom: int = len(bwt) - 1
        # Consume the pattern from its last symbol to its first.
        for symbol in reversed(pattern):
            occurrences = count.get(symbol)
            # No copy of symbol in bwt[top:bottom + 1]: the pattern is absent.
            if occurrences is None or occurrences[bottom + 1] == occurrences[top]:
                return 0
            start = first_occurrence[symbol]
            top, bottom = start + occurrences[top], start + occurrences[bottom + 1] - 1
        return bottom - top + 1

    count, first_occurrence = create_count_and_first_occurrence(bwt)
    return [count_matches(pattern) for pattern in patterns]
# Sample input processing
sample_input: str = """
TCCTCTATGAGATCCTATTCTATGAAACCTTCA$GACCAAAATTCTCCGGC
CCT CAC GAG CAG ATC
"""
input_lines: List[str] = sample_input.strip().split("\n")
bwt: str = input_lines[0]
patterns: List[str] = input_lines[1].split()

# Run the BWMatching algorithm
result: List[int] = BWMatching(bwt, patterns)
print(" ".join(map(str, result)))
98 Implement BetterBWMatching
Given: A string BWT(Text), followed by a collection of strings Patterns.
Return: A list of integers, where the i-th integer corresponds to the number of substring matches of the i-th member of Patterns in Text.
98.1 Sample Dataset
GGCGCCGC$TAGTCACACACGCCGTA
ACC CCG CAG
98.2 Sample Output
1 2 1
98.3 Solution
from typing import List, Dict, Tuple
def BetterBWMatching(bwt: str, patterns: List[str]) -> List[int]:
    """
    Count pattern occurrences using first-occurrence and count tables.

    Args:
        bwt: The Burrows-Wheeler transform of the text (last column).
        patterns: The patterns to look up.

    Returns:
        One count per pattern, in the order of ``patterns``. Patterns with a
        symbol absent from ``bwt`` count 0; an empty pattern matches every
        row, i.e. ``len(bwt)``.
    """

    def preprocess_bwt(bwt: str) -> Tuple[List[str], Dict[str, List[int]], Dict[str, int]]:
        first_column: List[str] = sorted(bwt)
        alphabet = set(bwt)
        # count[c][i] = number of occurrences of c in bwt[:i]
        count: Dict[str, List[int]] = {char: [0] * (len(bwt) + 1) for char in alphabet}
        # start_index[c] = first row of the sorted column that holds c
        start_index: Dict[str, int] = {char: first_column.index(char) for char in alphabet}

        for i, char in enumerate(bwt):
            for c, column in count.items():
                column[i + 1] = column[i] + (c == char)

        return first_column, count, start_index

    def count_occurrences(pattern: str) -> int:
        top: int = 0
        bottom: int = len(bwt) - 1
        # Walk the pattern backwards, narrowing the [top, bottom] row range.
        for symbol in reversed(pattern):
            occurrences = count.get(symbol)
            # The count table tells us whether symbol occurs in
            # bwt[top:bottom + 1] without scanning the slice.
            if occurrences is None or occurrences[bottom + 1] == occurrences[top]:
                return 0
            start = start_index[symbol]
            top, bottom = start + occurrences[top], start + occurrences[bottom + 1] - 1
        return bottom - top + 1

    first_column, count, start_index = preprocess_bwt(bwt)
    return [count_occurrences(pattern) for pattern in patterns]
# Sample input processing
sample_input: str = """
GGCGCCGC$TAGTCACACACGCCGTA
ACC CCG CAG
"""
input_lines: List[str] = sample_input.strip().split("\n")
bwt: str = input_lines[0]
patterns: List[str] = input_lines[1].split()

# Run the BetterBWMatching algorithm
result: List[int] = BetterBWMatching(bwt, patterns)
print(" ".join(map(str, result)))
99 Find All Occurrences of a Collection of Patterns in a String
Multiple Pattern Matching Problem. Find all occurrences of a collection of patterns in a text.
Given: A string Text and a collection of strings Patterns.
Return: All starting positions in Text where a string from Patterns appears as a substring.
99.1 Sample Dataset
AATCGGGTTCAATCGGGGT
ATCG
GGGT
99.2 Sample Output
1 4 11 15
99.3 Solution
from typing import List, Dict, Tuple
def burrows_wheeler_transform(text: str) -> str:
    """
    Return the Burrows-Wheeler transform of ``text``.

    All cyclic rotations are sorted and the last character of each rotation
    is collected. The caller is expected to have appended the "$" terminator.
    """
    rotations = sorted(text[i:] + text[:i] for i in range(len(text)))
    return "".join(rotation[-1] for rotation in rotations)
def create_checkpoint_array(bwt: str, checkpoint_interval: int) -> Dict[int, Dict[str, int]]:
    """
    Record per-symbol prefix counts at every ``checkpoint_interval``-th index.

    Args:
        bwt: The Burrows-Wheeler transformed string.
        checkpoint_interval: Distance between checkpoints; must be >= 1.

    Returns:
        A dict mapping each checkpoint index ``idx`` (0, k, 2k, ...) to a dict
        ``{symbol: number of occurrences of symbol in bwt[:idx]}``.

    Raises:
        ValueError: If ``checkpoint_interval`` is smaller than 1.
    """
    if checkpoint_interval < 1:
        raise ValueError("checkpoint_interval must be a positive integer")

    # Keep running totals so each checkpoint is a snapshot rather than a
    # fresh count over the whole prefix.
    running = dict.fromkeys(set(bwt), 0)
    checkpoint_array: Dict[int, Dict[str, int]] = {}
    for idx, symbol in enumerate(bwt):
        if idx % checkpoint_interval == 0:
            checkpoint_array[idx] = dict(running)
        running[symbol] += 1
    return checkpoint_array
def count_symbol(checkpoint_array: Dict[int, Dict[str, int]], idx: int, last_column: str, symbol: str) -> int:
    """
    Count the occurrences of ``symbol`` in ``last_column[:idx]``.

    The count stored at the nearest checkpoint at or below ``idx`` is taken
    and the remaining characters up to ``idx`` are counted directly.

    Args:
        checkpoint_array: Checkpoint index -> per-symbol prefix counts.
        idx: Exclusive end of the prefix to count in.
        last_column: The BWT string the checkpoints were built from.
        symbol: The symbol to count; must be a key of the checkpoint dicts.

    Returns:
        The number of occurrences of ``symbol`` before position ``idx``.
    """
    nearest_checkpoint = max(position for position in checkpoint_array if position <= idx)
    count = checkpoint_array[nearest_checkpoint][symbol]
    count += last_column[nearest_checkpoint:idx].count(symbol)
    return count
def create_partial_suffix_array(text: str, k: int) -> Dict[int, int]:
    """
    Build the partial suffix array of ``text``.

    Args:
        text: The text (including its terminator).
        k: Only suffix positions that are multiples of ``k`` are kept.

    Returns:
        A dict mapping a row of the sorted suffix list to the start position
        of that suffix, restricted to positions divisible by ``k``.
    """
    sorted_suffixes = sorted((text[i:], i) for i in range(len(text)))
    return {
        row: position
        for row, (_, position) in enumerate(sorted_suffixes)
        if position % k == 0
    }
def multiple_pattern_matching(first_occurrence: Dict[str, int], last_column: str, pattern: str, checkpoint_array: Dict[int, Dict[str, int]]) -> Tuple[int, int]:
    """
    Find the range of BWT-matrix rows whose suffixes start with ``pattern``.

    Args:
        first_occurrence: Symbol -> first row of that symbol in the sorted column.
        last_column: The BWT string.
        pattern: The pattern to search for.
        checkpoint_array: Checkpoints built from ``last_column``.

    Returns:
        ``(top, bottom)`` as an inclusive row range, or ``(-1, -1)`` when the
        pattern does not occur (or ``last_column`` is empty).
    """
    top = 0
    bottom = len(last_column) - 1

    while top <= bottom:
        if not pattern:
            return top, bottom
        symbol = pattern[-1]
        pattern = pattern[:-1]
        # str.find with bounds checks the row range without copying a slice.
        if last_column.find(symbol, top, bottom + 1) == -1:
            return -1, -1
        top = first_occurrence[symbol] + count_symbol(checkpoint_array, top, last_column, symbol)
        bottom = first_occurrence[symbol] + count_symbol(checkpoint_array, bottom + 1, last_column, symbol) - 1
    return -1, -1
def find_pattern_occurrences(text: str, patterns: List[str], checkpoint_interval: int = 100) -> List[int]:
    """
    Find every starting position in ``text`` where one of ``patterns`` occurs.

    The search runs on the BWT of ``text + '$'`` using a checkpointed count
    table and a partial suffix array, both sampled every
    ``checkpoint_interval`` entries.

    Args:
        text: The text to search (without terminator).
        patterns: The patterns to look for.
        checkpoint_interval: Sampling step for the checkpoint array and the
            partial suffix array.

    Returns:
        All match positions (0-based), sorted ascending. A position appears
        once per pattern that matches there.
    """
    terminated = text + '$'
    bwt = burrows_wheeler_transform(terminated)

    first_occurrence = {}
    for idx, symbol in enumerate(sorted(bwt)):
        first_occurrence.setdefault(symbol, idx)

    checkpoint_array = create_checkpoint_array(bwt, checkpoint_interval)
    partial_suffix_array = create_partial_suffix_array(terminated, checkpoint_interval)

    positions = []
    for pattern in patterns:
        top, bottom = multiple_pattern_matching(first_occurrence, bwt, pattern, checkpoint_array)
        if top == -1:
            continue
        for row in range(top, bottom + 1):
            # Step backwards through the text until a sampled row is hit.
            # Position 0 is always sampled, so the walk terminates.
            offset = 0
            while row not in partial_suffix_array:
                symbol = bwt[row]
                row = first_occurrence[symbol] + count_symbol(checkpoint_array, row, bwt, symbol)
                offset += 1
            positions.append(partial_suffix_array[row] + offset)

    return sorted(positions)
# Sample input processing
sample_input: str = """
AATCGGGTTCAATCGGGGT
ATCG
GGGT
"""
input_lines = sample_input.strip().split("\n")
text = input_lines[0]
patterns = input_lines[1:]

result = find_pattern_occurrences(text, patterns)
print(' '.join(str(pos) for pos in result))
100 Find All Approximate Occurrences of a Collection of Patterns in a String
Multiple Approximate Pattern Matching Problem. Find all approximate occurrences of a collection of patterns in a text.
Given: A string Text, a collection of strings Patterns, and an integer d.
Return: All positions in Text where a string from Patterns appears as a substring with at most d mismatches.
100.1 Sample Dataset
ACATGCTACTTT
ATT GCC GCTA TATT
1
100.2 Sample Output
2 4 4 6 7 8 9
100.3 Solution
from collections import defaultdict
from copy import copy
from typing import Dict, List, Tuple, Iterator
def suffix_array(text: str) -> List[int]:
    """Return the start positions of all suffixes of ``text`` in lexicographic order."""
    suffixes: Dict[int, str] = {i: text[i:] for i in range(len(text))}
    return sorted(suffixes, key=suffixes.__getitem__)
def partial_suffix_array(sequence: str, k: int) -> List[Tuple[int, int]]:
    """
    Return ``(row, position)`` pairs of the suffix array of ``sequence``,
    keeping only the entries whose position is a multiple of ``k``.
    """
    return [
        (row, position)
        for row, position in enumerate(suffix_array(sequence))
        if position % k == 0
    ]
def burrows_wheeler_transform(sequence: str) -> str:
    """
    Return the BWT of ``sequence`` derived from its suffix array.

    For each sorted suffix the preceding character is emitted; for the
    suffix starting at 0 the index -1 wraps around to the last character.
    """
    return "".join(sequence[start - 1] for start in suffix_array(sequence))
from itertools import accumulate
def first_occurrence(sequence: str) -> Dict[str, int]:
    """
    Map each distinct symbol to the index of its first occurrence in
    ``sorted(sequence)``.
    """
    unique_letters: List[str] = sorted(set(sequence))
    # Prefix sums of the per-letter counts; the leading 0 belongs to the
    # smallest letter, and zip() drops the surplus final total.
    counts: List[int] = [0] + list(accumulate(sequence.count(x) for x in unique_letters))
    return dict(zip(unique_letters, counts))
def count_symbols(sequence: str) -> List[Dict[str, int]]:
    """
    Build the prefix-count table of ``sequence``.

    Returns:
        A list of ``len(sequence) + 1`` defaultdicts; entry ``i`` holds, for
        every symbol seen so far, its number of occurrences in
        ``sequence[:i]`` (unseen symbols read as 0).
    """
    count: List[Dict[str, int]] = [defaultdict(int)]
    for symbol in sequence:
        snapshot = copy(count[-1])
        snapshot[symbol] += 1
        count.append(snapshot)
    return count
def find_location(row: int, psa: Dict[int, int], last_column: str, fo: Dict[str, int], cs: List[Dict[str, int]]) -> int:
    """
    Recover the text position of the suffix in ``row`` from a partial suffix array.

    Args:
        row: Row of the BWT matrix.
        psa: Partial suffix array, row -> text position.
        last_column: The BWT string.
        fo: Symbol -> first row of that symbol in the sorted column.
        cs: Prefix-count table; ``cs[i][c]`` = occurrences of ``c`` in ``last_column[:i]``.

    Returns:
        The start position in the text of the suffix at ``row``.
    """
    steps: int = 0
    # Each last-to-first step moves one character back in the text, so the
    # answer is the sampled position plus the number of steps taken.
    while row not in psa:
        predecessor: str = last_column[row]
        row = fo[predecessor] + cs[row][predecessor]
        steps += 1
    return steps + psa[row]
class BWMatch:
    """
    Approximate pattern matcher over the BWT of a text.

    Attributes set by ``__init__``:
        psa: partial suffix array of ``sequence + "$"`` (row -> position).
        sequence: the BWT of ``sequence + "$"`` (not the plain text).
        fo: first-occurrence table of the BWT.
        cs: prefix-count table of the BWT.
        max_mismatches: mismatch budget used by ``bwm``; set by ``match_patterns``.
    """

    def __init__(self, sequence: str, k: int = 10):
        self.psa: Dict[int, int] = dict(partial_suffix_array(sequence + "$", k))
        self.sequence: str = burrows_wheeler_transform(sequence + "$")
        self.fo: Dict[str, int] = first_occurrence(self.sequence)
        self.cs: List[Dict[str, int]] = count_symbols(self.sequence)
        self.max_mismatches: int = 0

    def update(self, pointers: Tuple[int, int], x: str) -> Tuple[int, int]:
        """Narrow the row range ``pointers`` by prepending symbol ``x``."""
        top, bottom = pointers
        return (self.fo[x] + self.cs[top][x], self.fo[x] + self.cs[bottom + 1][x] - 1)

    def bwm(self, pattern: str, pointers: Tuple[int, int], mismatch_count: int) -> List[int]:
        """
        Return the BWT-matrix rows matching ``pattern`` within the mismatch budget.

        Args:
            pattern: The part of the pattern not yet consumed (read from the end).
            pointers: Inclusive ``(top, bottom)`` row range matched so far.
            mismatch_count: Mismatches already spent on this path.
        """
        top, bottom = pointers
        if top > bottom:
            # Empty range: nothing further can match along this path.
            return []
        if not pattern:
            return list(range(top, bottom + 1))

        matches: List[int] = []
        pattern, symbol = pattern[:-1], pattern[-1]
        if symbol in self.fo:
            matches += self.bwm(pattern, self.update(pointers, symbol), mismatch_count)
        if mismatch_count < self.max_mismatches:
            # Substitute every other symbol that actually occurs in the text.
            # A fixed A/C/G/T list fails with KeyError in update() when the
            # text lacks one of those letters. "$" is never a substitute.
            for mismatch in self.fo:
                if mismatch != symbol and mismatch != "$":
                    matches += self.bwm(pattern, self.update(pointers, mismatch), mismatch_count + 1)
        return matches

    def match_patterns(self, patterns: List[str], max_mismatches: int) -> Iterator[int]:
        """Yield the text position of every approximate match of every pattern."""
        self.max_mismatches = max_mismatches
        for pattern in patterns:
            for match in self.bwm(pattern, (0, len(self.sequence) - 1), 0):
                yield find_location(match, self.psa, self.sequence, self.fo, self.cs)
sample_input: str = """
ACATGCTACTTT
ATT GCC GCTA TATT
1
"""
sequence, pattern_line, mismatch_line = sample_input.strip().split("\n")
patterns: List[str] = pattern_line.split()
mismatches: int = int(mismatch_line)
matcher: BWMatch = BWMatch(sequence)
print(*sorted(matcher.match_patterns(patterns, mismatches)))
101 Implement TreeColoring
Tree Coloring Problem. Color the internal nodes of a suffix tree given colors of the leaves.
Given: An adjacency list, followed by color labels for leaf nodes.
Return: Color labels for all nodes, in any order.
101.1 Sample Dataset
0 -> {}
1 -> {}
2 -> 0,1
3 -> {}
4 -> {}
5 -> 3,2
6 -> {}
7 -> 4,5,6
-
0: red
1: red
3: blue
4: blue
6: red
101.2 Sample Output
0: red
1: red
2: red
3: blue
4: blue
5: purple
6: red
7: purple
101.3 Solution
from collections import defaultdict
import io
from typing import Dict, List, Union, TextIO
def color_tree(tree: Dict[int, List[Dict[str, int]]], node_colors: Dict[int, Union[str, None]]) -> Dict[int, Union[str, None]]:
    """
    Color the internal nodes of a tree from the colors of their children.

    A node becomes "red" if all its children are red, "blue" if all are
    blue, and "purple" otherwise. Nodes listed in ``tree`` with no children
    are leaves and keep whatever color ``node_colors`` already holds.

    Args:
        tree: Parent -> list of ``{"n": child}`` entries.
        node_colors: Known colors; updated in place and returned.

    Returns:
        ``node_colors`` with every internal node of ``tree`` colored.

    Raises:
        ValueError: If some node can never be colored because a descendant
            has no color (or the adjacency list contains a cycle).
    """
    # Only nodes that actually have children need a computed color.
    pending = [node for node in tree if tree[node]]
    while pending:
        unresolved = []
        for node in pending:
            child_colors = [node_colors.get(child["n"]) for child in tree[node]]
            if any(color is None for color in child_colors):
                unresolved.append(node)  # retry once the children are colored
                continue
            if all(color == "red" for color in child_colors):
                node_colors[node] = "red"
            elif all(color == "blue" for color in child_colors):
                node_colors[node] = "blue"
            else:
                node_colors[node] = "purple"
        if len(unresolved) == len(pending):
            # A full pass made no progress; repeating it would loop forever.
            raise ValueError(f"cannot determine colors for nodes: {sorted(unresolved)}")
        pending = unresolved
    return node_colors
def parse_input(input_data: Union[str, TextIO]) -> "Tuple[Dict[int, List[Dict[str, int]]], Dict[int, Union[str, None]]]":
    """
    Parse an adjacency list followed by leaf colors.

    The input has ``parent -> child,child`` lines (``{}`` for no children),
    a line containing only ``-``, then ``node: color`` lines.

    Args:
        input_data: A multi-line string with the data itself, a filename
            (a string without newlines), or an open text file object.

    Returns:
        ``(tree, node_colors)``: ``tree`` maps each parent that has children
        to a list of ``{"n": child}`` dicts (a defaultdict of lists);
        ``node_colors`` maps node -> color and yields ``None`` for unknown
        nodes (a defaultdict).

    Raises:
        ValueError: If ``input_data`` is neither a string nor a text stream.
    """
    # The return annotation is quoted because Tuple is not among the typing
    # names imported by this solution.
    if isinstance(input_data, str):
        if '\n' in input_data:
            # If input_data contains newlines, treat it as the data itself
            lines = input_data.splitlines()
        else:
            # Otherwise, treat it as a filename; the context manager closes
            # the file even if reading fails.
            with open(input_data) as file_obj:
                lines = file_obj.read().splitlines()
    elif isinstance(input_data, (io.TextIOBase, TextIO)):
        # typing.TextIO alone does not match real streams such as
        # io.StringIO or the result of open(); io.TextIOBase does.
        lines = input_data
    else:
        raise ValueError("Input must be a filename, a string, or a file-like object")

    tree: Dict[int, List[Dict[str, int]]] = defaultdict(list)
    node_colors: Dict[int, Union[str, None]] = defaultdict(lambda: None)

    parsing_tree = True
    for line in lines:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines in either section
        if line == "-":
            parsing_tree = False
            continue
        if parsing_tree:
            if " -> " in line:
                parent, children = line.split(" -> ")
                if children != "{}":
                    for child in children.split(","):
                        tree[int(parent)].append({"n": int(child)})
        else:
            node, color = line.split(": ")
            node_colors[int(node)] = color

    return tree, node_colors
sample_input = """
0 -> {}
1 -> {}
2 -> 0,1
3 -> {}
4 -> {}
5 -> 3,2
6 -> {}
7 -> 4,5,6
-
0: red
1: red
3: blue
4: blue
6: red
"""
tree, node_colors = parse_input(sample_input)
colored_tree = color_tree(tree, node_colors)
for node in sorted(colored_tree.keys()):
    print(f"{node}: {colored_tree[node]}")
102 Construct the Partial Suffix Array of a String
Partial Suffix Array Construction Problem. Construct the partial suffix array of a string.
Given: A string Text and a positive integer K.
Return: SuffixArray_K(Text), in the form of a list of ordered pairs (i, SuffixArray(i)) for all nonempty entries in the partial suffix array.
102.1 Sample Dataset
PANAMABANANAS$
5
102.2 Sample Output
1,5
11,10
12,0
102.3 Solution
from typing import List, Dict, Tuple
def suffix_array(text: str) -> List[int]:
    """Return the start positions of the suffixes of ``text``, sorted lexicographically by suffix."""
    suffixes: Dict[int, str] = {i: text[i:] for i in range(len(text))}
    return sorted(suffixes, key=suffixes.__getitem__)
def partial_suffix_array(sequence: str, step: int) -> List[Tuple[int, int]]:
    """
    Return the ``(index, position)`` entries of the suffix array of
    ``sequence`` whose position is a multiple of ``step``.
    """
    return [
        (index, position)
        for index, position in enumerate(suffix_array(sequence))
        if position % step == 0
    ]
sample_input: str = """
PANAMABANANAS$
5
"""
sequence, step_str = sample_input.strip().split("\n")
step: int = int(step_str)

for entry in partial_suffix_array(sequence, step):
    print(*entry, sep=",")
103 Construct a Suffix Tree from a Suffix Array
Suffix Tree Construction from Suffix Array Problem. Construct a suffix tree from the suffix array and LCP array of a string.
Given: A string Text, SuffixArray(Text), and LCP(Text).
Return: The strings labeling the edges of SuffixTree(Text). (You may return these strings in any order.)
103.1 Sample Dataset
GTAGT$
5, 2, 3, 0, 4, 1
0, 0, 0, 2, 0, 1
103.2 Sample Output
$
$
$
AGT$
AGT$
AGT$
GT
T
103.3 Solution
from typing import Dict, List, Union
class Node:
    """A suffix-tree node: the key of its parent and the label of the edge leading to it."""

    def __init__(self, parent: Union[int, str, None] = None, label: str = ""):
        # Key of the parent node in the tree dict; None marks the root.
        self.parent: Union[int, str, None] = parent
        # Text on the edge from the parent to this node.
        self.label: str = label
def calculate_depth(tree: Dict[Union[int, str], Node], node: Union[int, str]) -> int:
    """
    Calculate the length of the concatenation of all path labels from the root to node

    Args:
        tree: Node key -> Node.
        node: Key of the node whose string depth is wanted.

    Returns:
        The summed label lengths along the parent chain, including ``node`` itself.
    """
    depth: int = len(tree[node].label)
    while tree[node].parent is not None:
        node = tree[node].parent
        depth += len(tree[node].label)
    return depth
def construct_suffix_tree(text: str, suffix_array: List[int], lcp_array: List[int]) -> Dict[Union[int, str], Node]:
    """
    Build a suffix tree from a suffix array and its LCP array.

    Leaves are keyed by their index ``i`` in the suffix array; internal
    nodes created by splitting an edge are keyed ``"y{i}"``. The root is
    keyed -1 while building and removed before returning, so top-level
    nodes still carry ``parent == -1``.

    Args:
        text: The text, including its terminator.
        suffix_array: Suffix array of ``text``.
        lcp_array: ``lcp_array[i]`` is the longest common prefix length of
            suffixes ``i - 1`` and ``i`` in sorted order.

    Returns:
        Node key -> Node for every non-root node; the labels are the edge
        strings of the suffix tree.
    """
    tree: Dict[Union[int, str], Node] = {-1: Node()}

    for i in range(len(text)):
        # Climb from the previously inserted leaf until the string depth
        # no longer exceeds the LCP with the new suffix.
        current_node: int = i - 1
        while tree[current_node].parent is not None and calculate_depth(tree, current_node) > lcp_array[i]:
            current_node = tree[current_node].parent
        current_depth: int = calculate_depth(tree, current_node)

        if lcp_array[i] == current_depth:
            # The branching point is an existing node: hang a new leaf on it.
            tree[i] = Node(current_node, text[suffix_array[i] + lcp_array[i]:])
        else:
            # The branching point lies inside an edge. Find the child of
            # current_node on the path to the previous leaf ...
            temp_node: int = i - 1
            while tree[temp_node].parent is not None and tree[temp_node].parent != current_node:
                temp_node = tree[temp_node].parent

            # ... and split its edge with a new internal node.
            new_node_key: str = f"y{i}"
            tree[new_node_key] = Node(current_node, text[suffix_array[i - 1] + current_depth : suffix_array[i - 1] + lcp_array[i]])
            # calculate_depth still sees the old temp_node here, because the
            # right-hand side is evaluated before the assignment.
            tree[temp_node] = Node(new_node_key, text[suffix_array[i - 1] + lcp_array[i] : suffix_array[i - 1] + calculate_depth(tree, temp_node)])
            tree[i] = Node(new_node_key, text[suffix_array[i] + lcp_array[i]:])

    del tree[-1]
    return tree
sample_input: str = """
GTAGT$
5, 2, 3, 0, 4, 1
0, 0, 0, 2, 0, 1
"""
text, suffix_array_str, lcp_array_str = sample_input.strip().split("\n")
suffix_array: List[int] = [int(x) for x in suffix_array_str.split(", ")]
lcp_array: List[int] = [int(x) for x in lcp_array_str.split(", ")]

suffix_tree: Dict[Union[int, str], Node] = construct_suffix_tree(text, suffix_array, lcp_array)
labels: List[str] = [suffix_tree[key].label for key in suffix_tree.keys()]
print(*sorted(labels), sep="\n")
106 Implement the Viterbi Algorithm
Given: A string x, followed by the alphabet Σ from which x was constructed, followed by the states States, transition matrix Transition, and emission matrix Emission of an HMM (Σ, States, Transition, Emission).
Return: A path that maximizes the (unconditional) probability Pr(x, π) over all possible paths π.
106.1 Sample Dataset
xyxzzxyxyy
--------
x y z
--------
A B
--------
A B
A 0.641 0.359
B 0.729 0.271
--------
x y z
A 0.117 0.691 0.192
B 0.097 0.42 0.483
106.2 Sample Output
AAABBAAAAA
106.3 Solution
from io import StringIO
from math import log
from typing import List, Dict, Tuple, Iterator
import numpy as np
def parse_input(input_iterator: Iterator[str]) -> Tuple[str, List[str], Dict[Tuple[str, str], float], Dict[Tuple[str, str], float]]:
    """
    Parse an HMM problem description from an iterator of lines.

    Expected layout, with a separator line between sections: emitted string,
    alphabet, states, transition matrix (header row + one row per state),
    emission matrix (header row + one row per state).

    Returns:
        ``(sequence, states, transition_matrix, emission_matrix)`` where the
        matrices are dicts keyed ``(from_state, to_state)`` and
        ``(state, symbol)`` respectively.
    """
    sequence = next(input_iterator).rstrip()
    next(input_iterator)  # separator
    alphabet = next(input_iterator).split()
    next(input_iterator)  # separator
    states = next(input_iterator).split()
    next(input_iterator)  # separator

    # Header row plus one row per state; the first token of a row is its label.
    transition_lines = [next(input_iterator) for _ in range(len(states) + 1)]
    transition_matrix = {
        (states[i], states[j]): float(value)
        for i, row in enumerate(transition_lines[1:])
        for j, value in enumerate(row.split()[1:])
    }
    next(input_iterator)  # separator

    emission_lines = [next(input_iterator) for _ in range(len(states) + 1)]
    emission_matrix = {
        (states[i], alphabet[j]): float(value)
        for i, row in enumerate(emission_lines[1:])
        for j, value in enumerate(row.split()[1:])
    }
    return sequence, states, transition_matrix, emission_matrix
def viterbi(sequence: str, states: List[str], transition_matrix: Dict[Tuple[str, str], float], emission_matrix: Dict[Tuple[str, str], float]) -> str:
    """
    Return the most probable hidden path for ``sequence`` (Viterbi algorithm).

    Scores are kept in log space. Every state is equally likely as the
    initial state.

    Args:
        sequence: The emitted string.
        states: The HMM states.
        transition_matrix: ``(from_state, to_state)`` -> probability.
        emission_matrix: ``(state, symbol)`` -> probability.

    Returns:
        The path as a concatenation of state names; "" for an empty sequence.
    """

    def safe_log(probability: float) -> float:
        # math.log raises ValueError at 0; an impossible event scores -inf.
        return log(probability) if probability > 0 else float("-inf")

    if not sequence:
        return ""

    num_states = len(states)
    sequence_length = len(sequence)
    viterbi_matrix = np.zeros((sequence_length, num_states))
    backpointer = np.zeros((sequence_length, num_states), dtype=int)

    # Initialize the first row of the viterbi matrix
    for i, state in enumerate(states):
        viterbi_matrix[0, i] = safe_log(emission_matrix[state, sequence[0]] / num_states)

    # Fill in the rest of the viterbi matrix
    for t in range(1, sequence_length):
        for j, current_state in enumerate(states):
            probabilities = [
                safe_log(transition_matrix[previous_state, current_state]) +
                safe_log(emission_matrix[current_state, sequence[t]]) +
                viterbi_matrix[t - 1, k]
                for k, previous_state in enumerate(states)
            ]
            best_score = max(probabilities)
            backpointer[t, j] = probabilities.index(best_score)
            viterbi_matrix[t, j] = best_score

    # Trace the best path back from the best final state.
    best_path_index = np.argmax(viterbi_matrix[-1, :])
    best_path = states[best_path_index]
    for t in range(sequence_length - 1, 0, -1):
        best_path_index = backpointer[t, best_path_index]
        best_path = states[best_path_index] + best_path

    return best_path
# Example usage
sample_input = """
xyxzzxyxyy
--------
x y z
--------
A B
--------
A B
A 0.641 0.359
B 0.729 0.271
--------
x y z
A 0.117 0.691 0.192
B 0.097 0.42 0.483
"""
input_lines = iter(StringIO(sample_input.strip()).readlines())
sequence, states, transition_matrix, emission_matrix = parse_input(input_lines)
result = viterbi(sequence, states, transition_matrix, emission_matrix)
print(result)
107 Compute the Probability of a String Emitted by an HMM
Given: A string x, followed by the alphabet Σ from which x was constructed, followed by the states States, transition matrix Transition, and emission matrix Emission of an HMM (Σ, States, Transition, Emission).
Return: The probability Pr(x) that the HMM emits x.
107.1 Sample Dataset
xzyyzzyzyy
--------
x y z
--------
A B
--------
A B
A 0.303 0.697
B 0.831 0.169
--------
x y z
A 0.533 0.065 0.402
B 0.342 0.334 0.324
107.2 Sample Output
1.1005510319694847e-06
107.3 Solution
from io import StringIO
import numpy as np
def parse_hmm_input(input_iterator):
    """
    Parse an HMM problem description from an iterator of lines.

    Expected layout, with a separator line between sections: emitted string,
    alphabet, states, transition matrix (header row + one row per state),
    emission matrix (header row + one row per state).

    Returns:
        ``(sequence, states, transition_matrix, emission_matrix)``; the
        matrices are dicts keyed ``(from_state, to_state)`` and
        ``(state, symbol)``.
    """
    sequence = next(input_iterator).rstrip()
    next(input_iterator)  # separator
    alphabet = next(input_iterator).split()
    next(input_iterator)  # separator
    states = next(input_iterator).split()
    next(input_iterator)  # separator

    def read_matrix(column_labels):
        # Header row plus one row per state; the first token of each data
        # row is the row label and is skipped.
        rows = [next(input_iterator) for _ in range(len(states) + 1)]
        return {
            (states[i], column_labels[j]): float(value)
            for i, row in enumerate(rows[1:])
            for j, value in enumerate(row.split()[1:])
        }

    transition_matrix = read_matrix(states)
    next(input_iterator)  # separator
    emission_matrix = read_matrix(alphabet)
    return sequence, states, transition_matrix, emission_matrix
def calculate_hmm_likelihood(sequence, states, transition_matrix, emission_matrix):
    """
    Compute Pr(sequence) under the HMM with the forward algorithm.

    Every state is equally likely as the initial state.

    Args:
        sequence: The emitted string.
        states: The HMM states.
        transition_matrix: ``(from_state, to_state)`` -> probability.
        emission_matrix: ``(state, symbol)`` -> probability.

    Returns:
        float: The probability that the HMM emits ``sequence``; 1.0 for the
        empty string.
    """
    if not sequence:
        return 1.0

    # forward[t, j] = Pr(sequence[:t + 1], state at step t is states[j])
    forward = np.zeros((len(sequence), len(states)))

    for i, state in enumerate(states):
        forward[0, i] = emission_matrix[state, sequence[0]] / len(states)

    for t, emission in enumerate(sequence[1:], start=1):
        for j, current_state in enumerate(states):
            forward[t, j] = sum(
                transition_matrix[previous_state, current_state] *
                emission_matrix[current_state, emission] *
                forward[t - 1, k]
                for k, previous_state in enumerate(states)
            )

    # Sum the last row explicitly. Indexing by a leftover loop variable
    # picks the wrong row when the sequence has a single symbol.
    return float(forward[-1, :].sum())
sample_input = """
xzyyzzyzyy
--------
x y z
--------
A B
--------
A B
A 0.303 0.697
B 0.831 0.169
--------
x y z
A 0.533 0.065 0.402
B 0.342 0.334 0.324
"""
input_lines = iter(StringIO(sample_input.strip()).readlines())
sequence, states, transition_matrix, emission_matrix = parse_hmm_input(input_lines)
result = calculate_hmm_likelihood(sequence, states, transition_matrix, emission_matrix)
print(result)
108 Construct a Profile HMM
Given: A threshold θ, followed by an alphabet Σ, followed by a multiple alignment Alignment whose strings are formed from Σ.
Return: The transition and emission probabilities of the profile HMM HMM(Alignment, θ).
108.1 Sample Dataset
0.289
--------
A B C D E
--------
EBA
EBD
EB-
EED
EBD
EBE
E-D
EBD
108.2 Sample Output
S I0 M1 D1 I1 M2 D2 I2 M3 D3 I3 E
S 0 0 1.0 0 0 0 0 0 0 0 0 0
I0 0 0 0 0 0 0 0 0 0 0 0 0
M1 0 0 0 0 0 0.875 0.125 0 0 0 0 0
D1 0 0 0 0 0 0 0 0 0 0 0 0
I1 0 0 0 0 0 0 0 0 0 0 0 0
M2 0 0 0 0 0 0 0 0 0.857 0.143 0 0
D2 0 0 0 0 0 0 0 0 1.0 0 0 0
I2 0 0 0 0 0 0 0 0 0 0 0 0
M3 0 0 0 0 0 0 0 0 0 0 0 1.0
D3 0 0 0 0 0 0 0 0 0 0 0 1.0
I3 0 0 0 0 0 0 0 0 0 0 0 0
E 0 0 0 0 0 0 0 0 0 0 0 0
--------
A B C D E
S 0 0 0 0 0
I0 0 0 0 0 0
M1 0 0 0 0 1.0
D1 0 0 0 0 0
I1 0 0 0 0 0
M2 0 0.857 0 0 0.143
D2 0 0 0 0 0
I2 0 0 0 0 0
M3 0.143 0 0 0.714 0.143
D3 0 0 0 0 0
I3 0 0 0 0 0
E 0 0 0 0 0
108.3 Solution
import numpy as np
from io import StringIO
from typing import List, Tuple, Iterator
def parse_hmm_input(input_iterator: Iterator[str]) -> Tuple[float, List[str], np.ndarray]:
    """
    Parse a profile-HMM problem: threshold, alphabet, then the alignment.

    Sections are separated by a single separator line; all remaining lines
    after the alphabet section are alignment rows.

    Returns:
        ``(threshold, alphabet, alignment)`` where ``alignment`` is a 2-D
        array with one character per cell.
    """
    threshold = float(next(input_iterator).rstrip())
    next(input_iterator)  # separator
    alphabet = next(input_iterator).split()
    next(input_iterator)  # separator
    alignment = np.array([list(sequence.strip()) for sequence in input_iterator])
    return threshold, alphabet, alignment
def calculate_state_index(position: int, state_type: str) -> int:
    """
    Return the row/column index of a profile-HMM state.

    States are laid out as S, I0, M1, D1, I1, M2, D2, I2, ..., E.
    ``position`` is the zero-based match-column index, so position 0 maps
    to M1/D1/I1 and ``("ins", -1)`` maps to I0.

    Args:
        position: Zero-based index of the match column (-1 before the first).
        state_type: "match", "del" or "ins".

    Raises:
        KeyError: If ``state_type`` is not one of the three names.
    """
    if state_type == "ins":
        return (position + 1) * 3 + 1
    return {"match": 0, "del": 1}[state_type] + 3 * position + 2
def normalize_row(row: np.ndarray, include_zeros: bool = False, min_value: float = 0.0) -> np.ndarray:
    """
    Scale ``row`` so that it sums to 1.

    Args:
        row: The values to normalize; left unmodified.
        include_zeros: If true, an all-zero row becomes a uniform distribution.
        min_value: Value written to every cell whose input was 0 (this also
            replaces the NaNs produced by an all-zero row).

    Returns:
        A new float array with the normalized values.
    """
    # Work on a copy so the caller's matrix row is never modified.
    values = np.array(row, dtype=float)
    if include_zeros and values.sum() == 0:
        values[:] = 1
    with np.errstate(divide="ignore", invalid="ignore"):
        normalized = values / values.sum()
    normalized[values == 0.0] = min_value
    return normalized
def normalize_matrix(matrix: np.ndarray, include_zeros: bool = False, min_value: float = 0.0) -> np.ndarray:
    """Normalize every row of ``matrix`` with ``normalize_row`` and return the rows as a new array."""
    normalized_rows = [
        normalize_row(row, include_zeros=include_zeros, min_value=min_value)
        for row in matrix
    ]
    return np.array(normalized_rows)
def print_matrix(matrix: np.ndarray, row_labels: List[str], column_labels: List[str]) -> None:
    """
    Print ``matrix`` as tab-separated text.

    The first line holds the column labels; each following line holds a row
    label and the row's values, rounded to 3 decimals. Values that are not
    positive are printed as "0".
    """
    print(*column_labels, sep="\t")
    for label, row in zip(row_labels, matrix):
        cells = [round(x, 3) if x > 0.0 else "0" for x in row]
        print(label, *cells, sep="\t")
def print_transition_probabilities(transition_matrix: np.ndarray) -> None:
    """Print the transition matrix with state labels on both axes."""
    # The matrix has 3 * n + 3 rows: S, I0, E plus M/D/I for each match column.
    n = (transition_matrix.shape[0] - 3) // 3
    state_labels = generate_state_labels(n)
    print_matrix(transition_matrix, state_labels, state_labels)
def print_emission_probabilities(emission_matrix: np.ndarray, alphabet: List[str]) -> None:
    """Print the emission matrix with state labels as rows and ``alphabet`` as columns."""
    # The matrix has 3 * n + 3 rows: S, I0, E plus M/D/I for each match column.
    n = (emission_matrix.shape[0] - 3) // 3
    print_matrix(emission_matrix, generate_state_labels(n), alphabet)
def generate_state_labels(n: int) -> List[str]:
    """Return the state names S, I0, M1, D1, I1, ..., Mn, Dn, In, E for ``n`` match columns."""
    labels = ["S", "I0"]
    for i in range(1, n + 1):
        labels += [f"M{i}", f"D{i}", f"I{i}"]
    labels.append("E")
    return labels
def create_transition_matrix(n: int) -> np.ndarray:
    """Return a zero-filled square float matrix with one row and column per state (3 * n + 3)."""
    state_count = n * 3 + 3
    return np.zeros((state_count, state_count), dtype=float)
def create_emission_matrix(n: int, m: int) -> np.ndarray:
    """Return a zero-filled float matrix with 3 * n + 3 state rows and ``m`` symbol columns."""
    state_count = n * 3 + 3
    return np.zeros((state_count, m), dtype=float)
def build_profile_hmm(threshold: float, alphabet: List[str], alignment: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build the transition and emission matrices of a profile HMM.

    Columns whose fraction of "-" is below ``threshold`` become match
    columns; symbols in the remaining columns are treated as insertions.
    Each alignment row is traced through the states and the observed
    transitions and emissions are counted, then every row is normalized.

    Args:
        threshold: Maximum gap fraction (exclusive) for a match column.
        alphabet: The symbols; their order fixes the emission columns.
        alignment: 2-D character array, one row per aligned sequence.

    Returns:
        ``(transition_probs, emission_probs)``.
    """
    valid_columns = np.mean(alignment == "-", axis=0) < threshold
    valid_column_count = int(valid_columns.sum())
    end_state = valid_column_count * 3 + 2
    transition_probs = create_transition_matrix(valid_column_count)
    emission_probs = create_emission_matrix(valid_column_count, len(alphabet))

    for sequence in alignment:
        prev_index = 0      # every path starts in S
        column_index = -1   # match column most recently passed
        for i, char in enumerate(sequence):
            if valid_columns[i]:
                column_index += 1
                if char == "-":
                    current_index = calculate_state_index(column_index, "del")
                else:
                    current_index = calculate_state_index(column_index, "match")
                transition_probs[prev_index, current_index] += 1
                prev_index = current_index
            elif char != "-":
                # A symbol in a non-match column is an insertion after the
                # last match column; gaps there are skipped.
                current_index = calculate_state_index(column_index, "ins")
                transition_probs[prev_index, current_index] += 1
                prev_index = current_index
            if char != "-":
                emission_probs[current_index, alphabet.index(char)] += 1
        transition_probs[prev_index, end_state] += 1

    transition_probs = normalize_matrix(transition_probs)
    emission_probs = normalize_matrix(emission_probs)

    return transition_probs, emission_probs
sample_input = """
0.289
--------
A B C D E
--------
EBA
EBD
EB-
EED
EBD
EBE
E-D
EBD
"""
input_lines = iter(StringIO(sample_input.strip()).readlines())
threshold, alphabet, alignment = parse_hmm_input(input_lines)
transition_probs, emission_probs = build_profile_hmm(threshold, alphabet, alignment)

print_transition_probabilities(transition_probs)
print("--------")
print_emission_probabilities(emission_probs, alphabet)
109 Construct a Profile HMM with Pseudocounts
Given: A threshold θ and a pseudocount σ, followed by an alphabet Σ, followed by a multiple alignment Alignment whose strings are formed from Σ.
Return: The transition and emission probabilities of the profile HMM HMM(Alignment, θ, σ).
109.1 Sample Dataset
0.358 0.01
--------
A B C D E
--------
ADA
ADA
AAA
ADC
-DA
D-A
109.2 Sample Output
S I0 M1 D1 I1 M2 D2 I2 M3 D3 I3 E
S 0 0.01 0.819 0.172 0 0 0 0 0 0 0 0
I0 0 0.333 0.333 0.333 0 0 0 0 0 0 0 0
M1 0 0 0 0 0.01 0.786 0.204 0 0 0 0 0
D1 0 0 0 0 0.01 0.981 0.01 0 0 0 0 0
I1 0 0 0 0 0.333 0.333 0.333 0 0 0 0 0
M2 0 0 0 0 0 0 0 0.01 0.981 0.01 0 0
D2 0 0 0 0 0 0 0 0.01 0.981 0.01 0 0
I2 0 0 0 0 0 0 0 0.333 0.333 0.333 0 0
M3 0 0 0 0 0 0 0 0 0 0 0.01 0.99
D3 0 0 0 0 0 0 0 0 0 0 0.5 0.5
I3 0 0 0 0 0 0 0 0 0 0 0.5 0.5
E 0 0 0 0 0 0 0 0 0 0 0 0
--------
A B C D E
S 0 0 0 0 0
I0 0.2 0.2 0.2 0.2 0.2
M1 0.771 0.01 0.01 0.2 0.01
D1 0 0 0 0 0
I1 0.2 0.2 0.2 0.2 0.2
M2 0.2 0.01 0.01 0.771 0.01
D2 0 0 0 0 0
I2 0.2 0.2 0.2 0.2 0.2
M3 0.803 0.01 0.168 0.01 0.01
D3 0 0 0 0 0
I3 0.2 0.2 0.2 0.2 0.2
E 0 0 0 0 0
109.3 Solution
import numpy as np
from io import StringIO
from typing import List, Tuple
def normalize_matrix(matrix: np.ndarray, include_zeros: bool = False, min_value: float = 0.0) -> np.ndarray:
    """Return a new array in which every row of ``matrix`` has been passed through ``normalize_row``."""
    return np.array(
        [normalize_row(row, include_zeros=include_zeros, min_value=min_value) for row in matrix]
    )
def normalize_row(row: np.ndarray, include_zeros: bool = False, min_value: float = 0.0) -> np.ndarray:
    """
    Divide ``row`` by its sum so that it adds up to 1.

    Args:
        row: The values to normalize; left unmodified.
        include_zeros: If true, an all-zero row is turned into a uniform row.
        min_value: Written to every cell whose input was 0, which also
            overwrites the NaNs of an all-zero row.

    Returns:
        A new float array with the normalized values.
    """
    # Copy first: writing into ``row`` would change the caller's matrix.
    values = np.array(row, dtype=float)
    if include_zeros and values.sum() == 0:
        values[:] = 1
    with np.errstate(divide="ignore", invalid="ignore"):
        normalized = values / values.sum()
    normalized[values == 0.0] = min_value
    return normalized
def print_matrix(matrix: np.ndarray, row_labels: List[str], col_labels: List[str]) -> None:
    """
    Print ``matrix`` as tab-separated text.

    The column labels come first, then one line per row: the row label
    followed by the values rounded to 3 decimals. Non-positive values are
    printed as "0".
    """
    print(*col_labels, sep="\t")
    for label, row in zip(row_labels, matrix):
        cells = [round(x, 3) if x > 0.0 else "0" for x in row]
        print(label, *cells, sep="\t")
def print_transition_probs(transition_matrix: np.ndarray) -> None:
    """Print the transition matrix with state labels on both axes."""
    # 3 * n + 3 rows: S, I0, E plus M/D/I for each of the n match columns.
    n = (transition_matrix.shape[0] - 3) // 3
    state_labels = generate_state_labels(n)
    print_matrix(transition_matrix, state_labels, state_labels)
def print_emission_probs(emission_matrix: np.ndarray, alphabet: List[str]) -> None:
    """Print the emission matrix: one row per state, one column per symbol of ``alphabet``."""
    # 3 * n + 3 rows: S, I0, E plus M/D/I for each of the n match columns.
    n = (emission_matrix.shape[0] - 3) // 3
    print_matrix(emission_matrix, generate_state_labels(n), alphabet)
def generate_state_labels(n: int) -> List[str]:
    """Return the profile-HMM state names in matrix order: S, I0, then M/D/I per match column, then E."""
    labels = ["S", "I0"]
    for i in range(1, n + 1):
        labels += [f"M{i}", f"D{i}", f"I{i}"]
    labels.append("E")
    return labels
def create_transition_matrix(n: int) -> np.ndarray:
    """Return a zero-filled (3n + 3) x (3n + 3) float matrix for ``n`` match columns."""
    size = n * 3 + 3
    return np.zeros((size, size), dtype=float)
def create_emission_matrix(n: int, m: int) -> np.ndarray:
    """Return a zero-filled (3n + 3) x m float matrix: one row per state, one column per symbol."""
    size = n * 3 + 3
    return np.zeros((size, m), dtype=float)
def calculate_state_index(position: int, state_type: str) -> int:
    """
    Return the matrix index of a profile-HMM state.

    The state order is S, I0, M1, D1, I1, M2, ... so with a zero-based match
    column ``position``: match -> 3 * position + 2, del -> 3 * position + 3,
    ins -> 3 * position + 4 (position -1 gives I0).

    Raises:
        KeyError: If ``state_type`` is not "match", "del" or "ins".
    """
    if state_type == "ins":
        return (position + 1) * 3 + 1
    return {"match": 0, "del": 1}[state_type] + 3 * position + 2
def build_profile_hmm(threshold: float, alphabet: List[str], alignment: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build the transition and emission matrices of a profile HMM (no pseudocounts).

    Columns with a gap fraction below ``threshold`` are match columns;
    symbols in other columns count as insertions. Transitions and emissions
    observed along each alignment row are counted and each matrix row is
    then normalized.

    Args:
        threshold: Maximum gap fraction (exclusive) for a match column.
        alphabet: The symbols; their order fixes the emission columns.
        alignment: 2-D character array, one row per aligned sequence.

    Returns:
        ``(transition_probs, emission_probs)``.
    """
    valid_columns = np.mean(alignment == "-", axis=0) < threshold
    valid_column_count = int(valid_columns.sum())
    end_state = valid_column_count * 3 + 2
    transition_probs = create_transition_matrix(valid_column_count)
    emission_probs = create_emission_matrix(valid_column_count, len(alphabet))

    for sequence in alignment:
        prev_index = 0      # every path starts in S
        column_index = -1   # match column most recently passed
        for i, char in enumerate(sequence):
            if valid_columns[i]:
                column_index += 1
                state_type = "del" if char == "-" else "match"
                current_index = calculate_state_index(column_index, state_type)
                transition_probs[prev_index, current_index] += 1
                prev_index = current_index
            elif char != "-":
                # Symbol in a non-match column: insertion state after the
                # last match column. Gaps in such columns are ignored.
                current_index = calculate_state_index(column_index, "ins")
                transition_probs[prev_index, current_index] += 1
                prev_index = current_index
            if char != "-":
                emission_probs[current_index, alphabet.index(char)] += 1
        transition_probs[prev_index, end_state] += 1

    transition_probs = normalize_matrix(transition_probs)
    emission_probs = normalize_matrix(emission_probs)

    return transition_probs, emission_probs
def parse_input(input_handle: "Iterator[str]") -> Tuple[float, float, List[str], np.ndarray]:
    """
    Parse a profile-HMM-with-pseudocounts problem from an iterator of lines.

    The first line holds the threshold and the pseudocount; after a
    separator comes the alphabet, after another separator the alignment rows.

    Returns:
        ``(threshold, pseudocount, alphabet, alignment)`` where ``alignment``
        is a 2-D array with one character per cell.
    """
    # The parameter annotation is quoted because Iterator is not among the
    # typing names imported by this solution.
    threshold, pseudocount = map(float, next(input_handle).rstrip().split())
    next(input_handle)  # separator
    alphabet = next(input_handle).split()
    next(input_handle)  # separator
    alignment = np.array([list(sequence.strip()) for sequence in input_handle])
    return threshold, pseudocount, alphabet, alignment
def add_transition_pseudocounts(transition_matrix: np.ndarray, pseudocount: float) -> np.ndarray:
    """
    Add ``pseudocount`` to every allowed transition and renormalize the rows.

    From S and I0 the allowed targets are I0, M1 and D1; from M_i, D_i and
    I_i they are I_i, M_{i+1} and D_{i+1}. For the last block the column
    slice is cut off at the matrix edge, leaving I_n and E.

    Args:
        transition_matrix: Square matrix over the 3n + 3 states; not modified.
        pseudocount: Value added to each allowed transition.

    Returns:
        A new row-normalized matrix.
    """
    # Float copy: leaves the caller's matrix intact and accepts integer input.
    matrix = np.array(transition_matrix, dtype=float)
    n = (matrix.shape[0] - 3) // 3
    matrix[0, 1:4] += pseudocount
    matrix[1, 1:4] += pseudocount
    for i in range(n):
        matrix[i * 3 + 2 : i * 3 + 5, (i + 1) * 3 + 1 : (i + 1) * 3 + 4] += pseudocount
    return normalize_matrix(matrix)
def add_emission_pseudocounts(emission_matrix: np.ndarray, pseudocount: float) -> np.ndarray:
    """
    Add ``pseudocount`` to every emission of the emitting states and renormalize.

    The emitting states are I0 and, for each match column, M_i and I_i; the
    rows of S, D_i and E are left untouched.

    Args:
        emission_matrix: Matrix with one row per state; not modified.
        pseudocount: Value added to each emission of an emitting state.

    Returns:
        A new row-normalized matrix.
    """
    # Float copy: leaves the caller's matrix intact and accepts integer input.
    matrix = np.array(emission_matrix, dtype=float)
    n = (matrix.shape[0] - 3) // 3
    matrix[1, :] += pseudocount              # I0
    for i in range(n):
        matrix[i * 3 + 2, :] += pseudocount  # M_{i+1}
        matrix[i * 3 + 4, :] += pseudocount  # I_{i+1}
    return normalize_matrix(matrix)
def build_pseudocount_profile_hmm(threshold: float, pseudocount: float, alphabet: List[str], alignment: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Build a profile HMM from the alignment, then apply pseudocounts.

    Returns:
        (transition_probs, emission_probs) after pseudocounts and
        renormalisation.
    """
    transition_probs, emission_probs = build_profile_hmm(threshold, alphabet, alignment)
    transition_probs = add_transition_pseudocounts(transition_probs, pseudocount)
    emission_probs = add_emission_pseudocounts(emission_probs, pseudocount)
    return transition_probs, emission_probs
# Sample dataset: threshold and pseudocount, alphabet, then the alignment.
sample_input = """
0.358 0.01
--------
A B C D E
--------
ADA
ADA
AAA
ADC
-DA
D-A
"""
input_lines = iter(StringIO(sample_input.strip()).readlines())
threshold, pseudocount, alphabet, alignment = parse_input(input_lines)
transition_probs, emission_probs = build_pseudocount_profile_hmm(threshold, pseudocount, alphabet, alignment)

# Print the two matrices separated by the dashed line used in the dataset.
print_transition_probs(transition_probs)
print("--------")
print_emission_probs(emission_probs, alphabet)
110 Perform a Multiple Sequence Alignment with a Profile HMM
Given: A string Text, a multiple alignment Alignment, a threshold θ, and a pseudocount σ.
Return: An optimal hidden path emitting Text in HMM(Alignment,θ,σ).
110.1 Sample Dataset
AEFDFDC
--------
0.4 0.01
--------
A B C D E F
--------
ACDEFACADF
AFDA---CCF
A--EFD-FDC
ACAEF--A-C
ADDEFAAADF
110.2 Sample Output
M1 D2 D3 M4 M5 I5 M6 M7 M8
110.3 Solution
from io import StringIO
from collections import defaultdict
from typing import List, Tuple, Dict, Iterator
import numpy as np
from math import inf, log
def generate_state_labels(num_states: int) -> List[str]:
    """Return the state labels: S, I0, then M/D/I for each index 1..num_states, then E."""
    labels = ["S", "I0"]
    for i in range(1, num_states + 1):
        labels.extend([f"M{i}", f"D{i}", f"I{i}"])
    labels.append("E")
    return labels
def normalize_row(row: np.ndarray, include_zeros: bool = False, minimum_value: float = 0.0) -> np.ndarray:
    """Scale a row so it sums to one.

    Args:
        row: 1-D array of non-negative counts.
        include_zeros: when True, an all-zero row becomes a uniform row.
        minimum_value: value written wherever the (possibly filled) row is zero.

    Returns:
        A new array; the caller's row is never modified.
    """
    # Work on a copy so the uniform fill below does not leak into the input.
    row = np.array(row, dtype=float)
    if include_zeros and row.sum() == 0:
        row[:] = 1
    # 0/0 yields NaN here; those positions are overwritten right after.
    with np.errstate(divide="ignore", invalid="ignore"):
        normalized = row / row.sum()
    normalized[row == 0.0] = minimum_value
    return normalized
def normalize_matrix(matrix: np.ndarray, include_zeros: bool = False, minimum_value: float = 0.0) -> np.ndarray:
    """Apply normalize_row to every row and stack the results into a new array."""
    return np.array([normalize_row(r, include_zeros=include_zeros, minimum_value=minimum_value) for r in matrix])
def create_transition_matrix(num_states: int) -> np.ndarray:
    """Return a zero-filled square float matrix of side ``3 * num_states + 3``."""
    size = num_states * 3 + 3
    return np.zeros((size, size), dtype=float)
def create_emission_matrix(num_states: int, num_symbols: int) -> np.ndarray:
    """Return a zero-filled float matrix with ``3 * num_states + 3`` rows and one column per symbol."""
    return np.zeros((num_states * 3 + 3, num_symbols), dtype=float)
def calculate_index(state_num: int, state_type: str) -> int:
    """Map a zero-based column number and state type to a matrix row index.

    ``state_type`` must be "ins", "match" or "del"; anything else raises
    KeyError. With ``state_num == -1`` the "ins" index is 1.
    """
    if state_type == "ins":
        return (state_num + 1) * 3 + 1
    offset = {"match": 0, "del": 1}[state_type]
    return offset + 3 * state_num + 2
def build_profile_hmm(threshold: float, alphabet: List[str], alignment: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Count transitions and emissions along each aligned sequence and normalise.

    Columns whose gap fraction is below ``threshold`` are the kept columns.
    A kept column contributes a match or delete state; a non-gap character
    in any other column contributes an insert state.

    Returns:
        (transition_probs, emission_probs) as row-normalised matrices.
    """
    valid_columns = np.mean(alignment == "-", axis=0) < threshold
    valid_length = sum(valid_columns)
    end_state = valid_length * 3 + 2
    transition_probs = create_transition_matrix(valid_length)
    emission_probs = create_emission_matrix(valid_length, len(alphabet))

    for sequence in alignment:
        prev_index = 0  # every path starts in row 0
        valid_col_count = -1  # zero-based index of the last kept column seen
        for col, char in enumerate(sequence):
            if valid_columns[col]:
                valid_col_count += 1
                if char == "-":
                    current_index = calculate_index(valid_col_count, "del")
                else:
                    current_index = calculate_index(valid_col_count, "match")
                transition_probs[prev_index, current_index] += 1
                prev_index = current_index
            else:
                # A gap in a dropped column visits no state at all.
                if char != "-":
                    current_index = calculate_index(valid_col_count, "ins")
                    transition_probs[prev_index, current_index] += 1
                    prev_index = current_index
            if char != "-":
                emission_probs[current_index, alphabet.index(char)] += 1
        transition_probs[prev_index, end_state] += 1

    transition_probs = normalize_matrix(transition_probs)
    emission_probs = normalize_matrix(emission_probs)

    return transition_probs, emission_probs
def add_pseudocounts_to_transitions(matrix: np.ndarray, pseudocount: float) -> np.ndarray:
    """Add a pseudocount to the 3-wide transition blocks and renormalise.

    Rows 0 and 1 get the pseudocount on columns 1..3; then each block of
    three rows starting at row 2 gets it on the next block of three columns.
    ``matrix`` is modified in place before the normalised copy is returned.
    """
    num_states = (matrix.shape[0] - 3) // 3
    matrix[0, 1:4] += pseudocount
    matrix[1, 1:4] += pseudocount
    for i in range(num_states):
        # Slices past the matrix edge are clipped by NumPy for the last block.
        matrix[i * 3 + 2 : i * 3 + 5, (i + 1) * 3 + 1 : (i + 1) * 3 + 4] += pseudocount
    return normalize_matrix(matrix)
def add_pseudocounts_to_emissions(matrix: np.ndarray, pseudocount: float) -> np.ndarray:
    """Add a pseudocount to selected emission rows and renormalise.

    Row 1 and, for each block i, rows ``3*i + 2`` and ``3*i + 4`` receive
    the pseudocount. ``matrix`` is modified in place before the normalised
    copy is returned.
    """
    num_states = (matrix.shape[0] - 3) // 3
    matrix[1, :] += pseudocount
    for i in range(num_states):
        matrix[i * 3 + 2, :] += pseudocount
        matrix[i * 3 + 4, :] += pseudocount
    return normalize_matrix(matrix)
def build_profile_hmm_with_pseudocounts(threshold: float, pseudocount: float, alphabet: List[str], alignment: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Build the profile HMM for the alignment and apply pseudocounts to both matrices."""
    transition_probs, emission_probs = build_profile_hmm(threshold, alphabet, alignment)
    transition_probs = add_pseudocounts_to_transitions(transition_probs, pseudocount)
    emission_probs = add_pseudocounts_to_emissions(emission_probs, pseudocount)
    return transition_probs, emission_probs
def parse_input_data(input_iterator: Iterator[str]) -> Tuple[str, float, float, List[str], np.ndarray]:
    """Parse a dataset: text, threshold/pseudocount, alphabet and alignment.

    The four sections are separated by single separator lines, which are
    consumed and ignored.

    Returns:
        (sequence, threshold, pseudocount, alphabet, alignment).
    """
    sequence = next(input_iterator).rstrip()
    next(input_iterator)  # separator line
    threshold, pseudocount = map(float, next(input_iterator).rstrip().split())
    next(input_iterator)  # separator line
    alphabet = next(input_iterator).split()
    next(input_iterator)  # separator line
    # Blank lines are skipped so a trailing newline cannot create a ragged row.
    alignment = np.array([list(x.strip()) for x in input_iterator if x.strip()])
    return sequence, threshold, pseudocount, alphabet, alignment
def convert_transition_probs_to_dict(matrix: np.ndarray) -> Dict[Tuple[str, str], float]:
    """Re-key a square transition matrix by (source label, target label).

    Labels come from generate_state_labels; missing keys default to 0.0.
    """
    prob_dict = defaultdict(float)
    num_states = (matrix.shape[0] - 3) // 3
    labels = generate_state_labels(num_states)
    size = matrix.shape[0]
    for i in range(size):
        for j in range(size):
            prob_dict[labels[i], labels[j]] = matrix[i][j]
    return prob_dict
def convert_emission_probs_to_dict(matrix: np.ndarray, alphabet: List[str]) -> Dict[Tuple[str, str], float]:
    """Re-key an emission matrix by (state label, symbol).

    Labels come from generate_state_labels; missing keys default to 0.0.
    """
    prob_dict = defaultdict(float)
    num_states = (matrix.shape[0] - 3) // 3
    labels = generate_state_labels(num_states)
    for i in range(matrix.shape[0]):
        for j, symbol in enumerate(alphabet):
            prob_dict[labels[i], symbol] = matrix[i][j]
    return prob_dict
def build_hmm_graph(transition_probs: Dict[Tuple[str, str], float], num_states: int) -> Dict[str, List[Dict[str, float]]]:
    """Build the adjacency list of the profile HMM.

    Each edge is stored as ``{"node": target, "weight": probability}``
    under its source label.
    """
    graph = defaultdict(list)

    def add_edge(source: str, target: str) -> None:
        graph[source].append({"node": target, "weight": transition_probs[source, target]})

    for target in ["I0", "M1", "D1"]:
        add_edge("S", target)
    # Insert states loop on themselves or advance to the next column.
    for i in range(num_states):
        source = f"I{i}"
        for target in [source, f"M{i+1}", f"D{i+1}"]:
            add_edge(source, target)
    for i in range(1, num_states):
        for source in [f"M{i}", f"D{i}"]:
            for target in [f"M{i+1}", f"I{i}", f"D{i+1}"]:
                add_edge(source, target)
    # The last column can only loop on its insert state or finish.
    for source in [f"I{num_states}", f"M{num_states}", f"D{num_states}"]:
        for target in [f"I{num_states}", "E"]:
            add_edge(source, target)

    return graph
def generate_topological_order(num_states: int, seq_length: int) -> Iterator[Tuple[str, int]]:
    """Yield (state, column) nodes in the order the dynamic programme visits them.

    Column 0 holds S and the delete states; each later column holds I0 and
    then M, D, I for every state index; E comes last at ``seq_length + 1``.
    """
    yield ("S", 0)
    for j in range(num_states):
        yield (f"D{j+1}", 0)
    for i in range(seq_length):
        yield ("I0", i + 1)
        for j in range(num_states):
            for state_type in ["M", "D", "I"]:
                yield (f"{state_type}{j+1}", i + 1)
    yield ("E", seq_length + 1)
def get_previous_nodes(current_node: str, current_col: int, num_states: int, seq_length: int) -> List[Tuple[str, int]]:
    """Return the (state, column) predecessors of a node.

    ``current_node`` is a label such as "M3", "D1", "I0" or "E". Labels
    other than "E" must carry a numeric suffix; the bare label "S" is not
    accepted.
    """
    def trio(state: int, col: int) -> List[Tuple[str, int]]:
        # Delete, match and insert states of one index at one column.
        return [(f"D{state}", col), (f"M{state}", col), (f"I{state}", col)]

    kind = current_node[0]
    if kind == "E":
        return trio(num_states, seq_length)

    state_num = int(current_node[1:])
    if current_col == 0:
        # Column 0 only holds the chain S -> D1 -> D2 -> ...
        return [("S", 0)] if state_num == 1 else [(f"D{state_num-1}", 0)]
    if current_node in ("I0", "M1"):
        return [("S", 0)] if current_col == 1 else [("I0", current_col - 1)]
    if kind == "I":
        return [(f"D{state_num}", 0)] if current_col == 1 else trio(state_num, current_col - 1)
    if kind == "M":
        return [(f"D{state_num-1}", 0)] if current_col == 1 else trio(state_num - 1, current_col - 1)
    if kind == "D":
        # Delete states emit nothing, so predecessors share the same column.
        return [("I0", current_col)] if state_num == 1 else trio(state_num - 1, current_col)

    print(f"Unhandled node: {current_node}")
    return []
def simplify_graph(graph: Dict[str, List[Dict[str, float]]]) -> Dict[str, Dict[str, float]]:
    """Collapse each edge list into a ``{target: weight}`` mapping per source node."""
    return {
        source: {edge["node"]: edge["weight"] for edge in edges}
        for source, edges in graph.items()
    }
# Main execution
def main(sample_input):
    """Parse the dataset, build the profile HMM and print the best hidden path.

    The path is found by a longest-path dynamic programme over log
    probabilities on the (state, column) grid, followed by a traceback.
    """
    input_lines = iter(StringIO(sample_input.strip()).readlines())
    sequence, threshold, pseudocount, alphabet, alignment = parse_input_data(input_lines)
    transition_probs, emission_probs = build_profile_hmm_with_pseudocounts(threshold, pseudocount, alphabet, alignment)
    num_states = (transition_probs.shape[0] - 3) // 3
    transition_probs_dict = convert_transition_probs_to_dict(transition_probs)
    emission_probs_dict = convert_emission_probs_to_dict(emission_probs, alphabet)

    graph = build_hmm_graph(transition_probs_dict, num_states)
    topological_order = generate_topological_order(num_states, len(sequence))
    simplified_graph = simplify_graph(graph)

    # Dynamic programming to find the most probable path
    previous_node = next(topological_order)  # the source node ("S", 0)
    scores = {previous_node: 0}
    backpointers = {previous_node: (None, None)}

    for current_node, current_col in topological_order:
        backpointers[(current_node, current_col)] = 0
        scores[(current_node, current_col)] = -inf
        for prev_node, prev_col in get_previous_nodes(current_node, current_col, num_states, len(sequence)):
            # A symbol is emitted only when the column advances; E emits nothing.
            if prev_col < current_col and current_node != "E":
                emission_prob = emission_probs_dict[current_node, sequence[current_col - 1]]
            else:
                emission_prob = 1
            log_prob = log(simplified_graph[prev_node][current_node]) + log(emission_prob) + scores[(prev_node, prev_col)]
            if log_prob > scores[(current_node, current_col)]:
                scores[(current_node, current_col)] = log_prob
                backpointers[(current_node, current_col)] = (prev_node, prev_col)

    # Traceback to find the path
    path = []
    position = ("E", len(sequence) + 1)
    while position[0]:
        path.append(backpointers[position][0])
        position = backpointers[position]

    # The reversed path starts with None and "S"; neither is part of the answer.
    print(*path[::-1][2:])
# Sample dataset: text, threshold and pseudocount, alphabet, then the alignment.
sample_input = """
AEFDFDC
--------
0.4 0.01
--------
A B C D E F
--------
ACDEFACADF
AFDA---CCF
A--EFD-FDC
ACAEF--A-C
ADDEFAAADF
"""
main(sample_input)
111 Estimate the Parameters of an HMM
Given: A sequence of emitted symbols \(x = x_1 \ldots x_n\) in an alphabet \(\Sigma\) and a path \(\pi = \pi_1 \ldots \pi_n\) generated by a k-state HMM with unknown transition and emission probabilities.
Return: A matrix of transition probabilities Transition and a matrix of emission probabilities Emission that maximize \(Pr(x,π)\) over all possible matrices of transition and emission probabilities.
111.1 Sample Dataset
yzzzyxzxxx
--------
x y z
--------
BBABABABAB
--------
A B C
111.2 Sample Output
A B C
A 0.0 1.0 0.0
B 0.8 0.2 0.0
C 0.333 0.333 0.333
--------
x y z
A 0.25 0.25 0.5
B 0.5 0.167 0.333
C 0.333 0.333 0.333
111.3 Solution
from io import StringIO
from collections import defaultdict
from typing import List, Tuple, Dict, Iterator, Union
import numpy as np
def normalize_row(row: np.ndarray, include_zeros: bool = False, minimum_value: float = 0.0) -> np.ndarray:
    """Scale a row so it sums to one.

    Args:
        row: 1-D array of non-negative counts.
        include_zeros: when True, an all-zero row becomes a uniform row.
        minimum_value: value written wherever the (possibly filled) row is zero.

    Returns:
        A new array; the caller's row is never modified.
    """
    # Work on a copy so the uniform fill below does not leak into the input.
    row = np.array(row, dtype=float)
    if include_zeros and row.sum() == 0:
        row[:] = 1
    # 0/0 yields NaN here; those positions are overwritten right after.
    with np.errstate(divide="ignore", invalid="ignore"):
        normalized = row / row.sum()
    normalized[row == 0.0] = minimum_value
    return normalized
def normalize_matrix(matrix: np.ndarray, include_zeros: bool = False, minimum_value: float = 0.0) -> np.ndarray:
    """Apply normalize_row to every row and stack the results into a new array."""
    return np.array([normalize_row(r, include_zeros=include_zeros, minimum_value=minimum_value) for r in matrix])
def print_matrix(matrix: np.ndarray, row_labels: List[str], column_labels: List[str]) -> None:
    """Print a tab-separated table: a header of column labels, then one labelled row per matrix row.

    Values are rounded to three decimals; non-positive values print as "0".
    """
    print(*column_labels, sep="\t")
    for i, row in enumerate(matrix):
        r = [row_labels[i]] + [round(x, 3) if x > 0.0 else "0" for x in row]
        print(*r, sep="\t")
def parse_input(handle: Iterator[str]) -> Tuple[str, List[str], str, List[str]]:
    """Parse emitted string, alphabet, hidden path and state names.

    The four sections are separated by single separator lines, which are
    consumed and ignored.
    """
    sequence = next(handle).rstrip()
    next(handle)  # separator line
    alphabet = next(handle).split()
    next(handle)  # separator line
    path = next(handle).rstrip()
    next(handle)  # separator line
    states = next(handle).split()
    return sequence, alphabet, path, states
def convert_to_dict(matrix: np.ndarray, row_labels: List[str], column_labels: List[str]) -> Dict[Tuple[str, str], float]:
    """Re-key a 2-D matrix by (row label, column label); missing keys default to 0.0."""
    result = defaultdict(float)
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    for i in range(n_rows):
        for j in range(n_cols):
            result[row_labels[i], column_labels[j]] = matrix[i][j]
    return result
def estimate_transition_matrix(path: str, states: List[str], to_dict: bool = False) -> Union[np.ndarray, Dict[Tuple[str, str], float]]:
    """Estimate transition probabilities from consecutive state pairs in ``path``.

    States with no outgoing transitions get a uniform row, and remaining
    zero entries are floored at 1e-16. Returns a matrix, or a dict keyed by
    (from_state, to_state) when ``to_dict`` is True.
    """
    transition_matrix = np.zeros((len(states), len(states)), dtype=float)
    for current_state, next_state in zip(path, path[1:]):
        transition_matrix[states.index(current_state)][states.index(next_state)] += 1
    transition_matrix = normalize_matrix(transition_matrix, include_zeros=True, minimum_value=1e-16)
    if to_dict:
        return convert_to_dict(transition_matrix, states, states)
    return transition_matrix
def estimate_emission_matrix(sequence: str, alphabet: List[str], path: str, states: List[str], to_dict: bool = False) -> Union[np.ndarray, Dict[Tuple[str, str], float]]:
    """Estimate emission probabilities from aligned (state, symbol) pairs.

    States that never appear in ``path`` get a uniform row, and remaining
    zero entries are floored at 1e-16. Returns a matrix, or a dict keyed by
    (state, symbol) when ``to_dict`` is True.
    """
    emission_matrix = np.zeros((len(states), len(alphabet)), dtype=float)
    for state, symbol in zip(path, sequence):
        emission_matrix[states.index(state)][alphabet.index(symbol)] += 1
    emission_matrix = normalize_matrix(emission_matrix, include_zeros=True, minimum_value=1e-16)
    if to_dict:
        return convert_to_dict(emission_matrix, states, alphabet)
    return emission_matrix
def main(sample_input: str) -> None:
    """Parse the dataset, estimate both matrices and print them separated by a dashed line."""
    input_lines = iter(StringIO(sample_input.strip()).readlines())
    sequence, alphabet, path, states = parse_input(input_lines)
    transition_matrix = estimate_transition_matrix(path, states)
    emission_matrix = estimate_emission_matrix(sequence, alphabet, path, states)

    print_matrix(transition_matrix, states, states)
    print("--------")
    print_matrix(emission_matrix, states, alphabet)
# Sample dataset: emitted string, alphabet, hidden path, state names.
sample_input = """
yzzzyxzxxx
--------
x y z
--------
BBABABABAB
--------
A B C
"""
main(sample_input)
112 Implement Viterbi Learning
Given: A sequence of emitted symbols \(x=x_1... x_n\) in an alphabet A, generated by a k-state HMM with unknown transition and emission probabilities, initial Transition and Emission matrices and a number of iterations i.
Return: A matrix of transition probabilities Transition and a matrix of emission probabilities Emission that maximizes \(Pr(x,π)\) over all possible transition and emission matrices and over all hidden paths π.
112.1 Sample Dataset
100
--------
xxxzyzzxxzxyzxzxyxxzyzyzyyyyzzxxxzzxzyzzzxyxzzzxyzzxxxxzzzxyyxzzzzzyzzzxxzzxxxyxyzzyxzxxxyxzyxxyzyxz
--------
x y z
--------
A B
--------
A B
A 0.582 0.418
B 0.272 0.728
--------
x y z
A 0.129 0.35 0.52
B 0.422 0.151 0.426
112.2 Sample Output
A B
A 0.875 0.125
B 0.011 0.989
--------
x y z
A 0.0 0.75 0.25
B 0.402 0.174 0.424
112.3 Solution
from io import StringIO
from typing import List, Dict, Tuple
import numpy as np
from math import log
def viterbi(sequence: str, states: List[str], transition_matrix: Dict[Tuple[str, str], float], emission_matrix: Dict[Tuple[str, str], float]) -> str:
    """Return the most probable hidden path for ``sequence`` (Viterbi, log space).

    The initial state distribution is uniform. State names are concatenated
    into the returned string, so they are expected to be single characters.

    Raises:
        ValueError: if a probability used along the way is zero
            (``math.log`` rejects it).
    """
    if not sequence:
        return ""

    mat = np.zeros((len(sequence), len(states)))
    ptr = np.zeros((len(sequence), len(states)), dtype=int)

    for i, state in enumerate(states):
        mat[0, i] = log(emission_matrix[state, sequence[0]] / len(states))

    for i, emission in enumerate(sequence[1:], start=1):
        for j, state in enumerate(states):
            opt = [
                log(transition_matrix[prev, state]) + log(emission_matrix[state, emission]) + mat[i - 1, k]
                for k, prev in enumerate(states)
            ]
            best = max(opt)
            ptr[i, j] = opt.index(best)
            mat[i, j] = best

    # Start the traceback from the last row explicitly instead of relying on
    # whatever value the loop variable happened to hold.
    last = len(sequence) - 1
    ind = int(np.argmax(mat[last, :]))

    state_sequence = states[ind]
    for i in range(last, 0, -1):
        ind = int(ptr[i, ind])
        state_sequence = states[ind] + state_sequence
    return state_sequence
def print_matrix(matrix: np.ndarray, row_labels: List[str], column_labels: List[str]) -> None:
    """Print a tab-separated table: a header of column labels, then one labelled row per matrix row.

    Values are rounded to three decimals; non-positive values print as "0".
    """
    print(*column_labels, sep="\t")
    for i, row in enumerate(matrix):
        r = [row_labels[i]] + [round(x, 3) if x > 0.0 else "0" for x in row]
        print(*r, sep="\t")
def normalize_matrix(matrix: np.ndarray, include_zeros: bool = True, min_val: float = 1e-16) -> np.ndarray:
    """Row-normalise a count matrix.

    Args:
        matrix: 2-D array of non-negative counts.
        include_zeros: when True, all-zero rows become uniform and any
            remaining zero entry is replaced by ``min_val``, so later
            logarithms stay finite.
        min_val: floor used for zero entries.

    Returns:
        A new array; rows that sum to zero never produce NaN.
    """
    matrix = np.asarray(matrix, dtype=float)
    row_sums = matrix.sum(axis=1, keepdims=True)
    empty_rows = (row_sums == 0).ravel()
    # Divide empty rows by 1 instead of 0 to avoid NaN and runtime warnings.
    normalized = matrix / np.where(row_sums == 0, 1.0, row_sums)
    if include_zeros:
        if matrix.shape[1] > 0:
            normalized[empty_rows] = 1.0 / matrix.shape[1]
        normalized[normalized == 0] = min_val
    return normalized
def estimate_transition_matrix(path: str, states: List[str], to_dict: bool = False) -> Dict[Tuple[str, str], float]:
    """Estimate transition probabilities from consecutive state pairs in ``path``.

    Returns the normalised matrix, or a dict keyed by (from_state, to_state)
    when ``to_dict`` is True.
    """
    tmat = np.zeros((len(states), len(states)), dtype=float)
    for a, b in zip(path, path[1:]):
        tmat[states.index(a)][states.index(b)] += 1
    tmat = normalize_matrix(tmat)
    if to_dict:
        return {(states[i], states[j]): tmat[i, j] for i in range(len(states)) for j in range(len(states))}
    return tmat
def estimate_emission_matrix(sequence: str, alphabet: List[str], path: str, states: List[str], to_dict: bool = False) -> Dict[Tuple[str, str], float]:
    """Estimate emission probabilities from aligned (state, symbol) pairs.

    Returns the normalised matrix, or a dict keyed by (state, symbol) when
    ``to_dict`` is True.
    """
    emat = np.zeros((len(states), len(alphabet)), dtype=float)
    for a, b in zip(path, sequence):
        emat[states.index(a)][alphabet.index(b)] += 1
    emat = normalize_matrix(emat)
    if to_dict:
        return {(states[i], alphabet[j]): emat[i, j] for i in range(len(states)) for j in range(len(alphabet))}
    return emat
def print_dict(d: Dict[Tuple[str, str], float], row_labels: List[str], column_labels: List[str]) -> None:
    """Lay out a (row, column)-keyed probability dict as a matrix and print it with print_matrix."""
    mat = np.zeros((len(row_labels), len(column_labels)), dtype=float)
    for i, r in enumerate(row_labels):
        for j, c in enumerate(column_labels):
            mat[i, j] = d[r, c]

    print_matrix(mat, row_labels, column_labels)
def parse_input(handle: StringIO) -> Tuple[int, str, List[str], List[str], Dict[Tuple[str, str], float], Dict[Tuple[str, str], float]]:
    """Parse a Viterbi-learning dataset.

    Sections, each followed by a separator line: iteration count, emitted
    string, alphabet, state names, transition table, emission table. The
    header row and the leading row label of each table are discarded.

    Returns:
        (niter, sequence, states, alphabet, transition_matrix, emission_matrix)
        with both matrices as dicts keyed by label pairs.
    """
    niter = int(next(handle).rstrip())
    next(handle)  # separator line
    sequence = next(handle).rstrip()
    next(handle)  # separator line
    alphabet = next(handle).split()
    next(handle)  # separator line
    states = next(handle).split()
    next(handle)  # separator line
    lines = [next(handle) for _ in range(len(states) + 1)]
    transition_matrix = {
        (states[i], states[j]): float(v)
        for i, x in enumerate(lines[1:])
        for j, v in enumerate(x.split()[1:])
    }
    next(handle)  # separator line
    lines = [next(handle) for _ in range(len(states) + 1)]
    emission_matrix = {
        (states[i], alphabet[j]): float(v)
        for i, x in enumerate(lines[1:])
        for j, v in enumerate(x.split()[1:])
    }
    return niter, sequence, states, alphabet, transition_matrix, emission_matrix
def main(sample_input: str) -> None:
    """Run Viterbi learning for the requested number of iterations and print both matrices.

    Each iteration decodes the best path with the current parameters and
    re-estimates the parameters from that path.
    """
    input_lines = StringIO(sample_input.strip())
    niter, sequence, states, alphabet, transition_matrix, emission_matrix = parse_input(input_lines)
    for _ in range(niter):
        path = viterbi(sequence, states, transition_matrix, emission_matrix)
        transition_matrix = estimate_transition_matrix(path, states, to_dict=True)
        emission_matrix = estimate_emission_matrix(sequence, alphabet, path, states, to_dict=True)

    print_dict(transition_matrix, states, states)
    print("--------")
    print_dict(emission_matrix, states, alphabet)
# Sample dataset: iterations, emitted string, alphabet, states, initial matrices.
sample_input = """
100
--------
xxxzyzzxxzxyzxzxyxxzyzyzyyyyzzxxxzzxzyzzzxyxzzzxyzzxxxxzzzxyyxzzzzzyzzzxxzzxxxyxyzzyxzxxxyxzyxxyzyxz
--------
x y z
--------
A B
--------
A B
A 0.582 0.418
B 0.272 0.728
--------
x y z
A 0.129 0.35 0.52
B 0.422 0.151 0.426
"""
main(sample_input)
113 Solve the Soft Decoding Problem
Given: A string x, followed by the alphabet Σ from which x was constructed, followed by the states States, transition matrix Transition, and emission matrix Emission of an HMM (Σ, States, Transition, Emission).
Return: The probability \(Pr(π_i=k|x)\) that the HMM was in state k at step i (for each state k and each step i).
113.1 Sample Dataset
zyxxxxyxzz
--------
x y z
--------
A B
--------
A B
A 0.911 0.089
B 0.228 0.772
--------
x y z
A 0.356 0.191 0.453
B 0.04 0.467 0.493
113.2 Sample Output
A B
0.5438 0.4562
0.6492 0.3508
0.9647 0.0353
0.9936 0.0064
0.9957 0.0043
0.9891 0.0109
0.9154 0.0846
0.964 0.036
0.8737 0.1263
0.8167 0.1833
113.3 Solution
from typing import List, Dict, Tuple, Iterator
from io import StringIO
import numpy as np
def parse_input(handle: Iterator[str]) -> Tuple[str, List[str], Dict[Tuple[str, str], float], Dict[Tuple[str, str], float]]:
    """Parse a soft-decoding dataset.

    Sections, each followed by a separator line: emitted string, alphabet,
    state names, transition table, emission table. The header row and the
    leading row label of each table are discarded.

    Returns:
        (seq, states, tmat, emat) with both matrices as dicts keyed by
        label pairs.
    """
    seq: str = next(handle).rstrip()
    next(handle)  # separator line
    alphabet: List[str] = next(handle).split()
    next(handle)  # separator line
    states: List[str] = next(handle).split()
    next(handle)  # separator line
    lines: List[str] = [next(handle) for _ in range(len(states) + 1)]
    tmat: Dict[Tuple[str, str], float] = {
        (states[i], states[j]): float(v)
        for i, x in enumerate(lines[1:])
        for j, v in enumerate(x.split()[1:])
    }
    next(handle)  # separator line
    lines = [next(handle) for _ in range(len(states) + 1)]
    emat: Dict[Tuple[str, str], float] = {
        (states[i], alphabet[j]): float(v)
        for i, x in enumerate(lines[1:])
        for j, v in enumerate(x.split()[1:])
    }
    return seq, states, tmat, emat
def forward(seq: str, states: List[str], tmat: Dict[Tuple[str, str], float], emat: Dict[Tuple[str, str], float]) -> np.ndarray:
    """Compute the forward matrix of the HMM.

    Entry ``[i, j]`` is the forward value for position i and ``states[j]``.
    No initial state probability is applied: row 0 is the bare emission
    probability.
    """
    mat: np.ndarray = np.ones((len(seq), len(states)))
    if not seq:
        return mat  # nothing emitted: empty (0, k) matrix

    for i, state in enumerate(states):
        mat[0, i] = emat[state, seq[0]]
    for i, emission in enumerate(seq[1:], start=1):
        for j, state in enumerate(states):
            mat[i, j] = sum(
                tmat[prev, state] * emat[state, emission] * mat[i - 1, k]
                for k, prev in enumerate(states)
            )

    return mat
def backward(seq: str, states: List[str], tmat: Dict[Tuple[str, str], float], emat: Dict[Tuple[str, str], float]) -> np.ndarray:
    """Compute the backward matrix of the HMM.

    The last row is all ones; earlier rows are filled from the end of the
    sequence towards the start.
    """
    mat: np.ndarray = np.ones((len(seq), len(states)))
    n = len(seq)

    # Walk the symbols right to left; the first symbol is never emitted "after" anything.
    for i, emission in enumerate(seq[::-1][:-1], start=1):
        for j, state in enumerate(states):
            mat[n - i - 1, j] = sum(
                tmat[state, prev] * emat[prev, emission] * mat[n - i, k]
                for k, prev in enumerate(states)
            )
    return mat
def soft_decode(seq: str, states: List[str], tmat: Dict[Tuple[str, str], float], emat: Dict[Tuple[str, str], float], normalise: bool = True) -> np.ndarray:
    """Return the element-wise product of the forward and backward matrices.

    With ``normalise`` (the default) each row is divided by its sum, so
    every row adds up to one.
    """
    tot: np.ndarray = forward(seq, states, tmat, emat) * backward(seq, states, tmat, emat)
    if normalise:
        tot = tot / np.sum(tot, axis=1, keepdims=True)
    return tot
def main(sample_input: str) -> None:
    """Parse the dataset and print the soft-decoding table, rounded to four decimals."""
    input_lines: Iterator[str] = StringIO(sample_input.strip())
    seq, states, tmat, emat = parse_input(input_lines)
    tot: np.ndarray = soft_decode(seq, states, tmat, emat)
    print(*states, sep="\t")
    for r in np.round(tot, 4):
        print(*r, sep="\t")
# Sample dataset: emitted string, alphabet, states, transition and emission tables.
sample_input: str = """
zyxxxxyxzz
--------
x y z
--------
A B
--------
A B
A 0.911 0.089
B 0.228 0.772
--------
x y z
A 0.356 0.191 0.453
B 0.04 0.467 0.493
"""
main(sample_input)
114 Implement Baum-Welch Learning
Given: A sequence of emitted symbols \(x=x_1...x_n\) in an alphabet A, generated by a k-state HMM with unknown transition and emission probabilities, initial Transition and Emission matrices and a number of iterations I.
Return: A matrix of transition probabilities Transition and a matrix of emission probabilities Emission that maximizes \(Pr(x,π)\) over all possible transition and emission matrices and over all hidden paths π.
114.1 Sample Dataset
10
--------
xzyyzyzyxy
--------
x y z
--------
A B
--------
A B
A 0.019 0.981
B 0.668 0.332
--------
x y z
A 0.175 0.003 0.821
B 0.196 0.512 0.293
114.2 Sample Output
A B
A 0.000 1.000
B 0.786 0.214
--------
x y z
A 0.242 0.000 0.758
B 0.172 0.828 0.000
114.3 Solution
from typing import List, Dict, Tuple, Iterator
from io import StringIO
import numpy as np
from collections import defaultdict
def print_matrix(matrix: np.ndarray, row_labels: List[str], col_labels: List[str]) -> None:
    """Print a tab-separated table: a header of column labels, then one labelled row per matrix row.

    Values are rounded to three decimals; non-positive values print as "0".
    """
    print(*col_labels, sep="\t")
    for i, row in enumerate(matrix):
        r = [row_labels[i]] + [round(x, 3) if x > 0.0 else "0" for x in row]
        print(*r, sep="\t")
def print_dict(d: Dict[Tuple[str, str], float], row_labels: List[str], col_labels: List[str]) -> None:
    """Lay out a (row, column)-keyed probability dict as a matrix and print it with print_matrix."""
    mat = np.zeros((len(row_labels), len(col_labels)), dtype=float)
    for i, r in enumerate(row_labels):
        for j, c in enumerate(col_labels):
            mat[i, j] = d[r, c]

    print_matrix(mat, row_labels, col_labels)
def parse_input(handle: Iterator[str]) -> Tuple[int, str, List[str], List[str], Dict[Tuple[str, str], float], Dict[Tuple[str, str], float]]:
    """Parse a Baum-Welch dataset.

    Sections, each followed by a separator line: iteration count, emitted
    string, alphabet, state names, transition table, emission table. The
    header row and the leading row label of each table are discarded.

    Returns:
        (niter, seq, states, alphabet, tmat, emat) with both matrices as
        dicts keyed by label pairs.
    """
    niter = int(next(handle).rstrip())
    next(handle)  # separator line
    seq = next(handle).rstrip()
    next(handle)  # separator line
    alphabet = next(handle).split()
    next(handle)  # separator line
    states = next(handle).split()
    next(handle)  # separator line
    lines = [next(handle) for _ in range(len(states) + 1)]
    tmat = {
        (states[i], states[j]): float(v)
        for i, x in enumerate(lines[1:])
        for j, v in enumerate(x.split()[1:])
    }
    next(handle)  # separator line
    lines = [next(handle) for _ in range(len(states) + 1)]
    emat = {
        (states[i], alphabet[j]): float(v)
        for i, x in enumerate(lines[1:])
        for j, v in enumerate(x.split()[1:])
    }
    return niter, seq, states, alphabet, tmat, emat
def forward(seq: str, states: List[str], tmat: Dict[Tuple[str, str], float], emat: Dict[Tuple[str, str], float]) -> np.ndarray:
    """Compute the forward matrix of the HMM.

    Entry ``[i, j]`` is the forward value for position i and ``states[j]``.
    No initial state probability is applied: row 0 is the bare emission
    probability.
    """
    mat = np.ones((len(seq), len(states)))
    if not seq:
        return mat  # nothing emitted: empty (0, k) matrix

    for i, state in enumerate(states):
        mat[0, i] = emat[state, seq[0]]
    for i, emission in enumerate(seq[1:], start=1):
        for j, state in enumerate(states):
            mat[i, j] = sum(
                tmat[prev, state] * emat[state, emission] * mat[i - 1, k]
                for k, prev in enumerate(states)
            )
    return mat
def backward(seq: str, states: List[str], tmat: Dict[Tuple[str, str], float], emat: Dict[Tuple[str, str], float]) -> np.ndarray:
    """Compute the backward matrix of the HMM.

    The last row is all ones; earlier rows are filled from the end of the
    sequence towards the start.
    """
    mat = np.ones((len(seq), len(states)))
    n = len(seq)
    # Walk the symbols right to left; the first symbol is never emitted "after" anything.
    for i, emission in enumerate(seq[::-1][:-1], start=1):
        for j, state in enumerate(states):
            mat[n - i - 1, j] = sum(
                tmat[state, prev] * emat[prev, emission] * mat[n - i, k]
                for k, prev in enumerate(states)
            )
    return mat
def soft_decode(seq: str, states: List[str], tmat: Dict[Tuple[str, str], float], emat: Dict[Tuple[str, str], float], normalise: bool = True) -> np.ndarray:
    """Return the element-wise product of the forward and backward matrices.

    With ``normalise`` (the default) each row is divided by its sum, so
    every row adds up to one.
    """
    tot = forward(seq, states, tmat, emat) * backward(seq, states, tmat, emat)
    if normalise:
        tot = tot / np.sum(tot, axis=1, keepdims=True)
    return tot
def as_dict(x: np.ndarray, r: List[str], c: List[str]) -> Dict[Tuple[str, str], float]:
    """Re-key matrix ``x`` by (row label, column label); missing keys default to 0.0."""
    g = defaultdict(float)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            g[r[i], c[j]] = x[i][j]
    return g
def estimate_pi2(seq: str, fwd: np.ndarray, bak: np.ndarray, tmat: Dict[Tuple[str, str], float], emat: Dict[Tuple[str, str], float], states: List[str]) -> np.ndarray:
    """Compute per-position transition responsibilities.

    Entry ``[i, j, k]`` is
    ``fwd[i, j] * bak[i + 1, k] * tmat[s_j, s_k] * emat[s_k, seq[i + 1]]``
    divided by ``sum(fwd[i] * bak[i])``.

    Returns:
        Array of shape ``(len(fwd) - 1, len(states), len(states))``.
    """
    rep_mat = np.zeros((fwd.shape[0] - 1, len(states), len(states)), dtype=float)
    for i in range(0, fwd.shape[0] - 1):
        # The normaliser depends only on i, so compute it once per position.
        total = sum(fwd[i, :] * bak[i, :])
        for j, s1 in enumerate(states):
            for k, s2 in enumerate(states):
                weight = tmat[s1, s2] * emat[s2, seq[i + 1]]
                rep_mat[i, j, k] = fwd[i, j] * bak[i + 1, k] * weight / total
    return rep_mat
def estimate_tmat(seq: str, st: List[str], tmat: Dict[Tuple[str, str], float], emat: Dict[Tuple[str, str], float]) -> Dict[Tuple[str, str], float]:
    """Re-estimate transition probabilities from the current parameters.

    Sums the transition responsibilities over positions, row-normalises
    them and returns a dict keyed by (from_state, to_state).
    """
    fwd = forward(seq, st, tmat, emat)
    bak = backward(seq, st, tmat, emat)
    pi2 = estimate_pi2(seq, fwd, bak, tmat, emat, st)
    tmat_new = np.sum(pi2, 0)
    tmat_new = tmat_new / np.sum(tmat_new, axis=1, keepdims=True)
    return as_dict(tmat_new, st, st)
def estimate_emat(seq: str, al: List[str], st: List[str], tmat: Dict[Tuple[str, str], float], emat: Dict[Tuple[str, str], float]) -> Dict[Tuple[str, str], float]:
    """Re-estimate emission probabilities from the current parameters.

    For each symbol, sums the soft-decoded state probabilities over the
    positions where that symbol occurs, then row-normalises. Returns a dict
    keyed by (state, symbol).
    """
    pi1 = soft_decode(seq, st, tmat, emat)
    emat_new = np.zeros((len(st), len(al)), dtype=float)
    symbols = np.array(list(seq))  # built once, compared against each symbol
    for i, emission in enumerate(al):
        ind = symbols == emission
        emat_new[:, i] = np.sum(pi1[ind, :], 0)
    emat_new = emat_new / np.sum(emat_new, axis=1, keepdims=True)
    return as_dict(emat_new, st, al)
def main(sample_input: str) -> None:
    """Run Baum-Welch learning for the requested number of iterations and print both matrices.

    Both matrices are re-estimated from the same previous parameters before
    either is replaced.
    """
    input_lines = StringIO(sample_input.strip())
    niter, seq, st, al, tmat, emat = parse_input(input_lines)
    for _ in range(niter):
        tmat2 = estimate_tmat(seq, st, tmat, emat)
        emat2 = estimate_emat(seq, al, st, tmat, emat)
        emat, tmat = emat2, tmat2

    print_dict(tmat, st, st)
    print("--------")
    print_dict(emat, st, al)
# Sample dataset: iterations, emitted string, alphabet, states, initial matrices.
sample_input: str = """
10
--------
xzyyzyzyxy
--------
x y z
--------
A B
--------
A B
A 0.019 0.981
B 0.668 0.332
--------
x y z
A 0.175 0.003 0.821
B 0.196 0.512 0.293
"""
main(sample_input)
115 Construct the Graph of a Spectrum
Spectrum Graph Construction. Construct the graph of a spectrum.
Given: A space-delimited list of integers Spectrum.
Return: Graph(Spectrum).
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
115.1 Sample Dataset
57 71 154 185 301 332 415 429 486
115.2 Sample Output
0->57:G
0->71:A
57->154:P
57->185:K
71->185:N
154->301:F
185->332:F
301->415:N
301->429:K
332->429:P
415->486:A
429->486:G
115.3 Solution
from collections import defaultdict
from typing import List, Dict, DefaultDict
# Amino acid weights dictionary
amino_acid_weights: Dict[str, int] = {
    'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99,
    'T': 101, 'C': 103, 'I': 113, 'L': 113, 'N': 114,
    'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131,
    'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186
}


def spectrum_graph(masses: List[int]) -> DefaultDict[int, List[Dict[str, int]]]:
    """Build the spectrum graph for a list of masses.

    An edge joins ``masses[i]`` to a later ``masses[j]`` when their
    difference equals an amino acid mass. Each edge is stored under its
    start mass as ``{"n": end_mass, "l": amino_acid_letter}``.
    """
    # Reverse mapping of weights to amino acids. Where two residues share a
    # mass (I/L and K/Q) the first one listed is kept, which matches the
    # letters shown in the sample output.
    weight_to_amino_acid: Dict[int, str] = {}
    for aa, weight in amino_acid_weights.items():
        weight_to_amino_acid.setdefault(weight, aa)

    graph: DefaultDict[int, List[Dict[str, int]]] = defaultdict(list)

    # Create graph based on mass differences
    for i, start in enumerate(masses):
        for end in masses[i + 1:]:
            difference: int = end - start
            if difference in weight_to_amino_acid:
                graph[start].append({"n": end, "l": weight_to_amino_acid[difference]})

    return graph
# Sample input
sample_input: str = "57 71 154 185 301 332 415 429 486"
# Mass 0 is prepended so edges can start from the empty prefix.
masses: List[int] = [0] + list(map(int, sample_input.split()))

# Print the spectrum graph
for start_mass, edges in spectrum_graph(masses).items():
    for edge in edges:
        print(f"{start_mass}->{edge['n']}:{edge['l']}")
116 Implement DecodingIdealSpectrum
Decoding an Ideal Spectrum Problem. Reconstruct a peptide from its ideal spectrum.
Given: A space-delimited list of integers, Spectrum.
Return: An amino acid string with an ideal spectrum that matches Spectrum.
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
116.1 Sample Dataset
57 71 154 185 301 332 415 429 486
116.2 Sample Output
GPFNA
116.3 Solution
from typing import Dict, List, Tuple
# Integer masses of the twenty standard amino acids.
amino_acid_masses: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57, 'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87, 'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163
}
# Reverse lookup; for masses shared by two residues the later entry wins.
mass_to_amino_acid: Dict[int, str] = {v: k for k, v in amino_acid_masses.items()}
def create_spectrum_graph(spectrum: List[int]) -> List[List[int]]:
    """Return the spectrum graph as a list of ``[start_mass, end_mass, letter]`` edges.

    An edge is added for every ordered pair of masses whose difference is
    an amino acid mass.
    """
    adjacency_list: List[List[int]] = []
    for i in range(len(spectrum)):
        for j in range(i, len(spectrum)):
            difference = spectrum[j] - spectrum[i]
            if difference in mass_to_amino_acid:
                adjacency_list.append([spectrum[i], spectrum[j], mass_to_amino_acid[difference]])
    return adjacency_list
def calculate_ideal_spectrum(peptide: str) -> List[int]:
    """Return the sorted masses of every contiguous sub-peptide, plus 0."""
    # prefix_mass[i] is the mass of the first i residues.
    prefix_mass: List[int] = [0]
    for i in range(len(peptide)):
        temp = prefix_mass[i] + amino_acid_masses[peptide[i]]
        prefix_mass.append(temp)

    linear_spectrum: List[int] = [0]
    for i in range(len(peptide)):
        for j in range(i + 1, len(peptide) + 1):
            linear_spectrum.append(prefix_mass[j] - prefix_mass[i])
    linear_spectrum.sort()
    return linear_spectrum
def find_paths(adjacency_list: List[List[int]]) -> List[str]:
    """Enumerate the peptides spelled by every path from mass 0 to a sink.

    Args:
        adjacency_list: edges as ``[start_mass, end_mass, letter]``.

    Returns:
        One string per path, visiting edges depth-first in list order.
        Returns an empty list when node 0 has no outgoing edge.
    """
    # Group edges by start node once instead of rescanning the whole list.
    outgoing = {}
    for edge in adjacency_list:
        outgoing.setdefault(edge[0], []).append(edge)

    peptide_list: List[str] = []
    # Each stack entry carries its own prefix, so branches with any number of
    # alternatives are all explored and never mix up their prefixes.
    stack = [(0, '')]
    while stack:
        node, peptide = stack.pop()
        edges = outgoing.get(node)
        if not edges:
            if peptide:
                peptide_list.append(peptide)
            continue
        # Push in reverse so the first listed edge is explored first.
        for edge in reversed(edges):
            stack.append((edge[1], peptide + edge[2]))

    return peptide_list
def decode_ideal_spectrum(spectrum: List[int]) -> "str | None":
    """Find a peptide that explains every mass in ``spectrum``.

    Candidate peptides are the paths of the spectrum graph; the first one
    whose fragment masses contain every mass of ``spectrum`` is returned.

    Args:
        spectrum: Masses of the spectrum, including the zero mass that the
            path search uses as its start node.

    Returns:
        The first matching peptide, or ``None`` when no candidate matches.
    """
    adjacency_list = create_spectrum_graph(spectrum)
    observed = set(spectrum)  # loop-invariant, build once
    for peptide in find_paths(adjacency_list):
        if observed.issubset(calculate_ideal_spectrum(peptide)):
            return peptide
    return None
sample_input: str = """
57 71 154 185 301 332 415 429 486
"""

spectrum: List[int] = [int(s) for s in sample_input.strip().split()]
# The path search starts at mass 0, which the dataset does not list.
spectrum = [0] + spectrum

print(decode_ideal_spectrum(spectrum))
117 Convert a Peptide into a Peptide Vector
Given: A peptide P.
Return: The peptide vector of P.
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
117.1 Sample Dataset
XZZXX
117.2 Sample Output
0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1
117.3 Solution
from typing import Dict, List
# Integer mass table: the twenty standard amino acids plus the imaginary
# residues X (4) and Z (5) used by the sample dataset.
amino_acid_masses: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
    'X': 4, 'Z': 5,
}
def create_peptide_vector(
    peptide: str,
    masses: "Dict[str, int] | None" = None,
) -> List[int]:
    """Convert a peptide into its binary peptide vector.

    The vector has one entry per mass unit of the whole peptide and holds a 1
    at index ``m - 1`` for every prefix mass ``m``.

    Args:
        peptide: Amino acid string.
        masses: Optional mapping ``amino acid -> mass``.  Defaults to the
            module-level ``amino_acid_masses``.

    Returns:
        The 0/1 peptide vector; an empty list for an empty peptide.

    Raises:
        KeyError: If ``peptide`` contains a letter missing from the table.
    """
    if masses is None:
        masses = amino_acid_masses

    # Running total instead of re-summing each prefix from scratch.
    prefix_masses: List[int] = []
    total = 0
    for amino_acid in peptide:
        total += masses[amino_acid]
        prefix_masses.append(total)

    vector: List[int] = [0] * total
    for mass in prefix_masses:
        vector[mass - 1] = 1
    return vector
sample_input: str = """
XZZXX
"""

peptide: str = sample_input.strip()
print(' '.join(str(x) for x in create_peptide_vector(peptide)))
118 Convert a Peptide Vector into a Peptide
Given: A space-delimited binary vector P.
Return: A peptide whose binary peptide vector matches P. For masses with more than one amino acid, any choice may be used.
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
118.1 Sample Dataset
0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1
118.2 Sample Output
XZZXX
118.3 Solution
from typing import Dict, List
# Integer mass table for the twenty standard amino acids (one-letter codes).
amino_acid_masses: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}

# Reverse lookup, mass -> one-letter code.  For the shared masses 113 (I/L)
# and 128 (K/Q) the later table entry wins: 113 -> 'L', 128 -> 'Q'.
mass_to_amino_acid: Dict[int, str] = {v: k for k, v in amino_acid_masses.items()}
# Imaginary residues used by the sample dataset.
mass_to_amino_acid[4] = 'X'
mass_to_amino_acid[5] = 'Z'
def convert_peptide_vector_to_sequence(
    vector: List[int],
    mass_table: "Dict[int, str] | None" = None,
) -> str:
    """Convert a binary peptide vector back into a peptide.

    Every 1 at index ``i`` marks a prefix mass ``i + 1``; the gaps between
    consecutive prefix masses are looked up as amino acid masses.

    Args:
        vector: 0/1 peptide vector.
        mass_table: Optional mapping ``mass -> amino acid``.  Defaults to the
            module-level ``mass_to_amino_acid``.

    Returns:
        The decoded peptide; an empty string when the vector contains no 1.

    Raises:
        KeyError: If a gap between prefix masses is not a known mass.
    """
    if mass_table is None:
        mass_table = mass_to_amino_acid

    # Start from mass 0 so the first residue is handled like all the others.
    prefix_masses: List[int] = [0]
    prefix_masses.extend(i + 1 for i, v in enumerate(vector) if v == 1)

    return ''.join(
        mass_table[heavier - lighter]
        for lighter, heavier in zip(prefix_masses, prefix_masses[1:])
    )
sample_input: str = """
0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1
"""

vector: List[int] = [int(x) for x in sample_input.strip().split()]

print(convert_peptide_vector_to_sequence(vector))
119 Sequence a Peptide
Peptide Sequencing Problem. Given a spectral vector S, find a peptide vector with maximum score against S.
Given: A space-delimited spectral vector S.
Return: A peptide with maximum score against S. For masses with more than one amino acid, any choice may be used.
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
119.1 Sample Dataset
0 0 0 4 -2 -3 -1 -7 6 5 3 2 1 9 3 -8 0 3 1 2 1 0
119.2 Sample Output
XZZXX
119.3 Solution
from typing import Dict, List, Tuple
# Integer mass table for the twenty standard amino acids (one-letter codes).
# NOTE(review): unlike the tables of the neighbouring solutions, this one has
# no entries for the imaginary residues X (4) and Z (5) that the sample
# dataset is built from -- confirm whether that is intended.
amino_acid_masses: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}

# Reverse lookup, mass -> one-letter code.  For the shared masses 113 (I/L)
# and 128 (K/Q) the later table entry wins: 113 -> 'L', 128 -> 'Q'.
mass_to_amino_acid: Dict[int, str] = {v: k for k, v in amino_acid_masses.items()}
def sequence_peptide(
    spectral_vector: List[int],
    mass_table: "Dict[int, str] | None" = None,
) -> str:
    """Find a peptide with maximum score against a spectral vector.

    Node ``j`` (1-based) carries the weight ``spectral_vector[j - 1]`` and
    node 0 is the zero-weight source.  Two nodes are connected when their
    distance is an amino acid mass; the answer is a maximum-weight path from
    node 0 to the last node.

    Args:
        spectral_vector: Integer spectral vector; it is not modified.
        mass_table: Optional mapping ``mass -> amino acid`` (positive
            masses).  Defaults to the module-level ``mass_to_amino_acid``.

    Returns:
        The highest-scoring peptide, or an empty string when the vector is
        empty or the last node cannot be reached from node 0.
    """
    if mass_table is None:
        mass_table = mass_to_amino_acid

    weights = [0] + list(spectral_vector)
    sink = len(weights) - 1
    unreachable = -float('inf')

    # Heaviest mass first means lowest-numbered parent first; together with
    # the strict ">" below, ties go to the lowest-numbered parent.
    masses_descending = sorted(mass_table, reverse=True)

    best_score: List[float] = [unreachable] * len(weights)
    best_score[0] = 0
    # came_from[node] = (parent node, amino acid on the connecting edge)
    came_from: List[tuple] = [()] * len(weights)

    # Ascending node order is a topological order: every edge points upward.
    for node in range(1, len(weights)):
        best_parent_score = unreachable
        for mass in masses_descending:
            parent = node - mass
            if parent < 0:
                continue
            if best_score[parent] > best_parent_score:
                best_parent_score = best_score[parent]
                came_from[node] = (parent, mass_table[mass])
        best_score[node] = best_parent_score + weights[node]

    if best_score[sink] == unreachable:
        return ''

    residues: List[str] = []
    node = sink
    while node != 0:
        node, amino_acid = came_from[node]
        residues.append(amino_acid)
    return ''.join(reversed(residues))
sample_input: str = """
0 0 0 4 -2 -3 -1 -7 6 5 3 2 1 9 3 -8 0 3 1 2 1 0
"""

spectral_vector: List[int] = [int(x) for x in sample_input.split()]

print(sequence_peptide(spectral_vector))
120 Find a Highest-Scoring Peptide in a Proteome against a Spectrum
Peptide Identification Problem. Find a peptide from a proteome with maximum score against a spectrum.
Given: A space-delimited spectral vector S and an amino acid string Proteome.
Return: A peptide in Proteome with maximum score against S.
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
120.1 Sample Dataset
0 0 0 4 -2 -3 -1 -7 6 5 3 2 1 9 3 -8 0 3 1 2 1 8
XZZXZXXXZXZZXZXXZ
120.2 Sample Output
ZXZXX
120.3 Solution
from typing import Dict, List
# Integer mass table: the twenty standard amino acids plus the imaginary
# residues X (4) and Z (5) used by the sample dataset.
amino_acid_masses: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
    'X': 4, 'Z': 5,
}
def create_peptide_vector(
    peptide: str,
    masses: "Dict[str, int] | None" = None,
) -> List[int]:
    """Convert a peptide into its binary peptide vector.

    The vector has one entry per mass unit of the whole peptide and holds a 1
    at index ``m - 1`` for every prefix mass ``m``.

    Args:
        peptide: Amino acid string.
        masses: Optional mapping ``amino acid -> mass``.  Defaults to the
            module-level ``amino_acid_masses``.

    Returns:
        The 0/1 peptide vector; an empty list for an empty peptide.

    Raises:
        KeyError: If ``peptide`` contains a letter missing from the table.
    """
    if masses is None:
        masses = amino_acid_masses

    prefix_masses: List[int] = []
    total = 0
    for amino_acid in peptide:
        total += masses[amino_acid]
        prefix_masses.append(total)

    vector: List[int] = [0] * total
    for mass in prefix_masses:
        vector[mass - 1] = 1
    return vector
def identify_peptide(
    spectral_vector: List[int],
    proteome: str,
    masses: "Dict[str, int] | None" = None,
) -> str:
    """Find the substring of ``proteome`` that scores best against a spectrum.

    Only substrings whose total mass equals ``len(spectral_vector)`` are
    candidates.  A candidate's score is the sum of the spectral vector
    entries at its prefix masses, i.e. the dot product of the spectral
    vector with the candidate's peptide vector.

    Args:
        spectral_vector: Integer spectral vector.
        proteome: Amino acid string to search.
        masses: Optional mapping ``amino acid -> mass`` (positive masses).
            Defaults to the module-level ``amino_acid_masses``.

    Returns:
        The first highest-scoring candidate, or an empty string when no
        substring has the required mass.

    Raises:
        KeyError: If a scanned residue is missing from the table.
    """
    if masses is None:
        masses = amino_acid_masses

    target_mass = len(spectral_vector)
    max_score: float = float('-inf')
    best_peptide: str = ""

    for start in range(len(proteome)):
        # Extend the candidate one residue at a time, keeping its mass and
        # score up to date instead of rebuilding a peptide vector each time.
        mass = 0
        score = 0
        for end in range(start, len(proteome)):
            mass += masses[proteome[end]]
            if mass > target_mass:
                break
            score += spectral_vector[mass - 1]
            if mass == target_mass:
                if score > max_score:
                    max_score = score
                    best_peptide = proteome[start:end + 1]
                break
    return best_peptide
sample_input: str = """
0 0 0 4 -2 -3 -1 -7 6 5 3 2 1 9 3 -8 0 3 1 2 1 8
XZZXZXXXZXZZXZXXZ
"""

input_lines: List[str] = sample_input.strip().split("\n")
spectral_vector: List[int] = [int(x) for x in input_lines[0].strip().split()]
proteome: str = input_lines[1].strip()

print(identify_peptide(spectral_vector, proteome))
121 Implement PSMSearch
PSM Search Problem. Identify Peptide-Spectrum Matches by matching spectra against a proteome.
Given: A set of space-delimited spectral vectors SpectralVectors, an amino acid string Proteome, and a score threshold T.
Return: All unique Peptide-Spectrum Matches scoring at least as high as T.
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
121.1 Sample Dataset
-1 5 -4 5 3 -1 -4 5 -1 0 0 4 -1 0 1 4 4 4
-4 2 -2 -4 4 -5 -1 4 -1 2 5 -3 -1 3 2 -3
XXXZXZXXZXZXXXZXXZX
5
121.2 Sample Output
XZXZ
121.3 Solution
from typing import Dict, List, Set, Tuple
# Integer mass table: the twenty standard amino acids plus the imaginary
# residues X (4) and Z (5) used by the sample dataset.
amino_acid_masses: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
    'X': 4, 'Z': 5,
}
def is_number(n: str) -> bool:
    """Report whether ``float()`` can parse the string ``n``."""
    try:
        float(n)
    except ValueError:
        parsed = False
    else:
        parsed = True
    return parsed
def create_peptide_vector(
    peptide: str,
    masses: "Dict[str, int] | None" = None,
) -> List[int]:
    """Convert a peptide into its binary peptide vector.

    The vector has one entry per mass unit of the whole peptide and holds a 1
    at index ``m - 1`` for every prefix mass ``m``.

    Args:
        peptide: Amino acid string.
        masses: Optional mapping ``amino acid -> mass``.  Defaults to the
            module-level ``amino_acid_masses``.

    Returns:
        The 0/1 peptide vector; an empty list for an empty peptide.

    Raises:
        KeyError: If ``peptide`` contains a letter missing from the table.
    """
    if masses is None:
        masses = amino_acid_masses

    prefix_masses: List[int] = []
    total = 0
    for amino_acid in peptide:
        total += masses[amino_acid]
        prefix_masses.append(total)

    vector: List[int] = [0] * total
    for mass in prefix_masses:
        vector[mass - 1] = 1
    return vector
def identify_peptide(
    spectral_vector: List[int],
    proteome: str,
    masses: "Dict[str, int] | None" = None,
) -> Tuple[str, float]:
    """Find the best-scoring substring of ``proteome`` and its score.

    Only substrings whose total mass equals ``len(spectral_vector)`` are
    candidates.  A candidate's score is the sum of the spectral vector
    entries at its prefix masses, i.e. the dot product of the spectral
    vector with the candidate's peptide vector.

    Args:
        spectral_vector: Integer spectral vector.
        proteome: Amino acid string to search.
        masses: Optional mapping ``amino acid -> mass`` (positive masses).
            Defaults to the module-level ``amino_acid_masses``.

    Returns:
        ``(peptide, score)`` for the first highest-scoring candidate, or
        ``('', float('-inf'))`` when no substring has the required mass.

    Raises:
        KeyError: If a scanned residue is missing from the table.
    """
    if masses is None:
        masses = amino_acid_masses

    target_mass = len(spectral_vector)
    max_score: float = float('-inf')
    best_peptide: str = ''

    for start in range(len(proteome)):
        # Extend the candidate one residue at a time, keeping its mass and
        # score up to date instead of rebuilding a peptide vector each time.
        mass = 0
        score = 0
        for end in range(start, len(proteome)):
            mass += masses[proteome[end]]
            if mass > target_mass:
                break
            score += spectral_vector[mass - 1]
            if mass == target_mass:
                if score > max_score:
                    max_score = score
                    best_peptide = proteome[start:end + 1]
                break
    return best_peptide, max_score
def search_peptide_spectrum_matches(
    spectral_vectors: List[List[int]],
    proteome: str,
    threshold: float,
) -> Set[str]:
    """Collect the peptides that match a spectrum with a sufficient score.

    For every spectral vector the best-scoring peptide of ``proteome`` is
    looked up with ``identify_peptide``; it is kept when its score is at
    least ``threshold``.

    Args:
        spectral_vectors: Spectral vectors to match.
        proteome: Amino acid string to search.
        threshold: Minimum score (inclusive) for a match to be reported.

    Returns:
        The set of matching peptides, without duplicates.
    """
    best_matches = (identify_peptide(vector, proteome) for vector in spectral_vectors)
    return {peptide for peptide, score in best_matches if score >= threshold}
sample_input: str = """
-1 5 -4 5 3 -1 -4 5 -1 0 0 4 -1 0 1 4 4 4
-4 2 -2 -4 4 -5 -1 4 -1 2 5 -3 -1 3 2 -3
XXXZXZXXZXZXXXZXXZX
5
"""

input_lines: List[str] = sample_input.strip().split("\n")

# Leading lines that start with a digit or a signed digit are spectral
# vectors; the first line that does not is the proteome, and the line after
# it is the threshold.  Slices are used so a blank line cannot raise.
spectral_vectors: List[List[int]] = []
idx: int = 0
while idx < len(input_lines) and (is_number(input_lines[idx][:1]) or is_number(input_lines[idx][:2])):
    vector: List[int] = [int(x) for x in input_lines[idx].strip().split()]
    spectral_vectors.append(vector)
    idx += 1

proteome: str = input_lines[idx].strip()
threshold: int = int(input_lines[idx + 1])

result: Set[str] = search_peptide_spectrum_matches(spectral_vectors, proteome, threshold)

# Sorted so the output order does not depend on set iteration order.
for peptide in sorted(result):
    print(peptide)
122 Compute the Size of a Spectral Dictionary
Size of Spectral Dictionary Problem. Find the size of the spectral dictionary for a given spectrum and score threshold.
Given: A spectral vector Spectrum’, an integer threshold, and an integer max_score.
Return: The size of the dictionary \(Dictionary_{threshold}(Spectrum')\).
Note: Use the provided max_score for the height of your table. Your answer should be the number of peptides whose score is at least threshold and at most max_score.
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
122.1 Sample Dataset
4 -3 -2 3 3 -4 5 -3 -1 -1 3 4 1 3
1
8
122.2 Sample Output
3
122.3 Solution
from typing import Dict, List
# Integer mass table for the twenty standard amino acids (one-letter codes).
# NOTE(review): the imaginary residues X (4) and Z (5) that the sample
# dataset relies on are not part of this table -- confirm whether intended.
amino_acid_masses: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}

# One mass per amino acid; 113 and 128 therefore appear twice.
mass_values: List[int] = list(amino_acid_masses.values())
def calculate_spectral_dictionary_size(
    spectral_vector: List[int],
    threshold: int,
    max_score: int,
    masses: "List[int] | None" = None,
) -> int:
    """Count the peptides whose score lies in ``[threshold, max_score]``.

    ``size[i][t]`` is the number of peptides of mass ``i`` that score exactly
    ``t`` against the first ``i`` entries of the spectral vector.  Scores
    outside ``0..max_score`` are not tracked, so peptides whose running score
    leaves that range are not counted.

    Args:
        spectral_vector: Integer spectral vector.
        threshold: Lowest score to count; values below 0 are treated as 0.
        max_score: Highest score to count (height of the table).
        masses: Optional list of amino acid masses, one entry per amino acid
            (a mass shared by two amino acids is listed twice).  Defaults to
            the module-level ``mass_values``.

    Returns:
        The number of peptides of mass ``len(spectral_vector)`` with a score
        in the requested range.
    """
    if masses is None:
        masses = mass_values
    if max_score < 0:
        return 0

    vector_length = len(spectral_vector)
    size = [[0] * (max_score + 1) for _ in range(vector_length + 1)]
    size[0][0] = 1  # the empty peptide has mass 0 and score 0

    for i in range(1, vector_length + 1):
        gain = spectral_vector[i - 1]
        # Rows of the peptides that are one amino acid shorter.
        lighter_rows = [size[i - mass] for mass in masses if mass <= i]
        for t in range(max_score + 1):
            previous_t = t - gain
            if 0 <= previous_t <= max_score:
                size[i][t] = sum(row[previous_t] for row in lighter_rows)

    return sum(size[vector_length][max(threshold, 0):])
sample_input: str = """
4 -3 -2 3 3 -4 5 -3 -1 -1 3 4 1 3
1
8
"""

input_lines: List[str] = sample_input.strip().split("\n")
spectral_vector: List[int] = [int(x) for x in input_lines[0].strip().split()]
threshold: int = int(input_lines[1])
max_score: int = int(input_lines[2])

print(calculate_spectral_dictionary_size(spectral_vector, threshold, max_score))
123 Compute the Probability of a Spectral Dictionary
Probability of Spectral Dictionary Problem. Find the probability of the spectral dictionary for a given spectrum and score threshold.
Given: A spectral vector Spectrum’, an integer threshold, and an integer max_score.
Return: The probability of the dictionary \(Dictionary_{threshold}(Spectrum')\).
Note: Use the provided max_score for the height of your table.
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
123.1 Sample Dataset
4 -3 -2 3 3 -4 5 -3 -1 -1 3 4 1 3
1
8
123.2 Sample Output
0.375
123.3 Solution
from typing import Dict, List
# Integer mass table for the twenty standard amino acids (one-letter codes).
# NOTE(review): the imaginary residues X (4) and Z (5) that the sample
# dataset relies on are not part of this table -- confirm whether intended.
amino_acid_masses: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
}

# One mass per amino acid; 113 and 128 therefore appear twice.
mass_values: List[int] = list(amino_acid_masses.values())
def calculate_spectral_dictionary_probability(
    spectral_vector: List[int],
    threshold: int,
    max_score: int,
    masses: "List[int] | None" = None,
) -> float:
    """Compute the probability of the spectral dictionary.

    ``probability[i][t]`` is the total probability of the peptides of mass
    ``i`` that score exactly ``t``, where every amino acid is drawn with
    probability ``1 / len(masses)``.  Scores outside ``0..max_score`` are not
    tracked.

    Args:
        spectral_vector: Integer spectral vector.
        threshold: Lowest score to include; values below 0 are treated as 0.
        max_score: Highest score to include (height of the table).
        masses: Optional list of amino acid masses, one entry per amino acid
            (a mass shared by two amino acids is listed twice).  Defaults to
            the module-level ``mass_values``.

    Returns:
        The summed probability of all peptides of mass
        ``len(spectral_vector)`` with a score in the requested range.
    """
    if masses is None:
        masses = mass_values
    if max_score < 0:
        return 0.0

    alphabet_size = len(masses)
    vector_length = len(spectral_vector)
    probability = [[0.0] * (max_score + 1) for _ in range(vector_length + 1)]
    probability[0][0] = 1.0  # the empty peptide has mass 0 and score 0

    for i in range(1, vector_length + 1):
        gain = spectral_vector[i - 1]
        # Rows of the peptides that are one amino acid shorter.
        lighter_rows = [probability[i - mass] for mass in masses if mass <= i]
        if not lighter_rows:
            continue
        for t in range(max_score + 1):
            previous_t = t - gain
            if 0 <= previous_t <= max_score:
                # Sum over the last amino acid, then weight by its probability.
                total = sum(row[previous_t] for row in lighter_rows)
                probability[i][t] = total / alphabet_size

    return sum(probability[vector_length][max(threshold, 0):])
sample_input: str = """
4 -3 -2 3 3 -4 5 -3 -1 -1 3 4 1 3
1
8
"""

input_lines: List[str] = sample_input.strip().split("\n")
spectral_vector: List[int] = [int(x) for x in input_lines[0].strip().split()]
threshold: int = int(input_lines[1])
max_score: int = int(input_lines[2])

print(calculate_spectral_dictionary_probability(spectral_vector, threshold, max_score))
124 Find a Highest-Scoring Modified Peptide against a Spectrum
Spectral Alignment Problem. Given a peptide and a spectral vector, find a modified variant of this peptide that maximizes the peptide-spectrum score among all variants of the peptides with up to k modifications.
Given: A peptide Peptide, a spectral vector Spectrum’, and an integer k.
Return: A peptide Peptide’ related to Peptide by up to k modifications with maximal score against Spectrum’ out of all possibilities.
Note: In this chapter, all dataset problems implicitly use the standard integer-valued mass table for the regular twenty amino acids. Examples sometimes use imaginary amino acids X and Z having respective integer masses 4 and 5.
124.1 Sample Dataset
XXZ
4 -3 -2 3 3 -4 5 -3 -1 -1 3 4 1 3
2
124.2 Sample Output
XX(-1)Z(+2)
124.3 Solution
from typing import Dict, List
# Integer mass table: the twenty standard amino acids plus the imaginary
# residues X (4) and Z (5) used by the sample dataset.
amino_acid_masses: Dict[str, int] = {
    'A': 71, 'C': 103, 'E': 129, 'D': 115, 'G': 57,
    'F': 147, 'I': 113, 'H': 137, 'K': 128, 'M': 131,
    'L': 113, 'N': 114, 'Q': 128, 'P': 97, 'S': 87,
    'R': 156, 'T': 101, 'W': 186, 'V': 99, 'Y': 163,
    'X': 4, 'Z': 5,
}
def print_score_matrix(
    score_matrix: Dict[int, Dict[int, Dict[int, float]]],
    prefix_masses: List[int],
    spectral_vector: List[int],
    max_k: int,
) -> None:
    """Print the spectral alignment score table, one block per layer.

    The header shows the spectral vector and the column indices.  Each layer
    ``t`` in ``0..max_k`` then gets one row per prefix mass; scores below
    ``-1e5`` are shown as ``XX``.

    Args:
        score_matrix: Scores indexed as ``score_matrix[mass][column][layer]``.
        prefix_masses: Row keys, printed in the given order.
        spectral_vector: Spectral vector; its length sets the column count.
        max_k: Index of the last layer to print.
    """
    print(' ', *(str(x).rjust(3) for x in spectral_vector))
    print(' ')
    print(' ')
    print(' ', *(str(j).rjust(3) for j in range(len(spectral_vector))))

    for t in range(max_k + 1):
        for mass in prefix_masses:
            row = [f"{mass:>3} "]
            for j in range(len(spectral_vector)):
                score = score_matrix[mass][j][t]
                cell = 'XX' if score < -1e5 else score
                row.append(f"{cell:>3}")
            print(' '.join(row))
        print(' ')
    return None
def spectral_alignment(
    peptide: str,
    spectral_vector: List[int],
    max_k: int,
    masses: "Dict[str, int] | None" = None,
) -> str:
    """Find the best-scoring variant of ``peptide`` with up to ``max_k`` modifications.

    ``score[i][j][t]`` is the best score of the first ``i`` residues ending at
    mass ``j`` when ``t`` residues were allowed to change their mass.  A
    residue either keeps its mass (same layer) or jumps from any smaller mass
    of the layer below.

    Args:
        peptide: Amino acid string.
        spectral_vector: Integer spectral vector; it is not modified.
        max_k: Maximum number of modified residues.
        masses: Optional mapping ``amino acid -> mass`` (positive masses).
            Defaults to the module-level ``amino_acid_masses``.

    Returns:
        The peptide with every modified residue followed by its mass shift,
        e.g. ``XX(-1)Z(+2)``.  An empty string for an empty peptide.

    Raises:
        KeyError: If ``peptide`` contains a letter missing from the table.
        ValueError: If ``max_k`` is negative or no variant with at most
            ``max_k`` modifications fits the length of the spectral vector.
    """
    if masses is None:
        masses = amino_acid_masses
    if not peptide:
        return ''
    if max_k < 0:
        raise ValueError("max_k must not be negative")

    # Work on a copy with a leading 0 so that column j stands for mass j.
    spectrum = [0] + list(spectral_vector)
    n_columns = len(spectrum)
    n_layers = max_k + 1
    unreachable = -float("inf")
    residue_masses = [masses[amino_acid] for amino_acid in peptide]

    score = [
        [[unreachable] * n_layers for _ in range(n_columns)]
        for _ in range(len(peptide) + 1)
    ]
    score[0][0][0] = 0

    for i, step in enumerate(residue_masses, start=1):
        previous, current = score[i - 1], score[i]
        # best_before[t] = max(previous[j'][t] for j' < j), kept up to date
        # while j sweeps left to right instead of rescanning the row.
        best_before = [unreachable] * n_layers
        for j in range(n_columns):
            for t in range(n_layers):
                best = previous[j - step][t] if j >= step else unreachable
                if t > 0 and best_before[t - 1] > best:
                    best = best_before[t - 1]
                current[j][t] = spectrum[j] + best
            for t in range(n_layers):
                if previous[j][t] > best_before[t]:
                    best_before[t] = previous[j][t]

    # Lowest layer that reaches the maximum final score.
    final_scores = score[len(peptide)][n_columns - 1]
    layer = 0
    for t in range(1, n_layers):
        if final_scores[t] > final_scores[layer]:
            layer = t
    if final_scores[layer] == unreachable:
        raise ValueError("no variant of the peptide fits the spectral vector")

    # Backtrace from the last column, preferring the unmodified step.
    pieces: List[str] = []
    column = n_columns - 1
    for i in range(len(peptide), 0, -1):
        step = residue_masses[i - 1]
        previous = score[i - 1]
        amino_acid = peptide[i - 1]

        if column >= step and score[i][column][layer] == spectrum[column] + previous[column - step][layer]:
            pieces.append(amino_acid)
            column -= step
            continue

        # Modified residue: jump to the best smaller mass one layer below.
        candidates = [previous[j][layer - 1] for j in range(column)]
        previous_column = candidates.index(max(candidates))
        shift = column - previous_column - step
        if shift > 0:
            pieces.append(f"{amino_acid}(+{shift})")
        elif shift < 0:
            pieces.append(f"{amino_acid}({shift})")
        else:
            pieces.append(amino_acid)
        column = previous_column
        layer -= 1

    return ''.join(reversed(pieces))
sample_input: str = """
XXZ
4 -3 -2 3 3 -4 5 -3 -1 -1 3 4 1 3
2
"""

input_lines: List[str] = sample_input.strip().split("\n")
peptide_sequence: str = input_lines[0]
spectral_vector_values: List[int] = [int(x) for x in input_lines[1].strip().split()]
max_k_value: int = int(input_lines[2])

print(spectral_alignment(peptide_sequence, spectral_vector_values, max_k_value))