Принимать шаблон, содержащий список последовательностей M, каждая из которых имеет длину M. Это список выровненные последовательности.
Использование его в классе AlignedScore, который создает оценку соответствия для выравнивающей последовательности на основе частоты встречаемости нуклеотидов в позиции pos по всей матрице.
Выравнивание каждой последовательности во входном файле с шаблоном, добавляя выровненную последовательность в шаблон для дальнейшего выравнивания.
Это жадное выравнивание:
class MyClass:
def __init__(self, x):
self.indel = -5
self.d = {}
self.l = len(x)
for i in range(len(x[0])):
self.d[i] = dict()
for j in x:
try:
self.d[i][j[i]] += 1
except KeyError:
self.d[i][j[i]] = 1
def s(self, i, j):
if j == "N":
return 1
return 2 * self.d[i].get(j, 0) / self.l - 1
def greedyAlign(x, s):
o = MyClass(x)
a = [[float("-inf")] * (len(s) + 1) for _ in range(len(x[0]) + 1)]
b = [[None] * (len(s) + 1) for _ in range(len(x[0]) + 1)]
a[0][0] = 0
for i in range(len(x[0]) + 1):
for j in range(len(s) + 1):
if i > 0 and j > 0:
g = o.s(i - 1, s[j - 1])
if a[i - 1][j - 1] + g > a[i][j]:
a[i][j] = a[i - 1][j - 1] + g
b[i][j] = 0
if i > 0 and a[i - 1][j] + o.indel > a[i][j]:
a[i][j] = a[i - 1][j] + o.indel
b[i][j] = 1
if j > 0 and a[i][j - 1] + o.indel > a[i][j]:
a[i][j] = a[i][j - 1] + o.indel
b[i][j] = 2
i = len(x[0])
j = len(s)
c = [""] * len(x)
d = ""
while i > 0 or j > 0:
if b[i][j] == 1:
i -= 1
for e, f in enumerate(c):
c[e] = x[e][i] + f
d = "-" + d
elif b[i][j] == 2:
j -= 1
for e, f in enumerate(c):
c[e] = "-" + f
d = s[j] + d
else:
i -= 1
j -= 1
for e, f in enumerate(c):
c[e] = x[e][i] + f
d = s[j] + d
c.append(d)
return c
Это моя чистая точка зрения и точка зрения Хиршберга:
class AlignmentScore:
def __init__(self, template):
self.indel = -5 # Horizontal or vertical movement score. Representing insertion or deletion.
# Construct a dictionary storing the number time nucleotide "nt" shows up at position int(pos) across all sequence in template.
# Ex: self.post_dict[0]["A"] = 5 means there are 5 nt "A" at position 0 across len(template) number of sequence
self.pos_dict = {} # Itinialize pos_dict = dict(int(pos), dict("nt", int(count))).
self.template_len = len(template) # Number of seq in template.
for pos in range(len(template[0])): # Iterating across the length of one seq in template.
self.pos_dict[pos] = dict()
for seq in template: # Iterating across all seq in template.
nt = seq[pos] # Nucleotide at pos in seq.
try:
self.pos_dict[pos][nt] += 1 # Update number of the time nucleotide at position pos is nt across all seq in template.
except KeyError:
self.pos_dict[pos][nt] = 1 # Initialize nt count.
def score(self, pos, nt): # Calculate the score of a diagonal movement. The diagonal movement score is based on how frequently the nt occur at position pos.
# Ex: seq
if nt == "N":
return 1
# Return the normalized score = 2* (Number of nt occurence at pos)/len(seq) -1
return 2 * self.pos_dict[pos].get(nt, 0) / self.template_len - 1 # -1 < Score < 1
def align(template, seq):
'''
Implement the two sequence global alignment using a linear edge algorithm.
Input : template
Output : tuple of aligned DNA sequences
'''
###################implement here#####################
alignment = [] #alignment stored outside of the recursion stored as a list of tuple, each containng the pair of aligned nucleotide of x and y
template_seq_len = len(template[0])
linear_edge_alignment(0,template_seq_len,0,len(seq),template,seq,alignment) #start the recursion from 0 to length of each string
seq_aligned = "".join(alignment) #join the aligned of the sequence
template.append(seq_aligned)
return template
def linear_edge_alignment(top,bottom,left,right,template,seq,alignment):
'''
This recursion divide the sequence into two again and again, storing the aligned best middle node. Until the base case, when len(x) or len(y) ==0, which
is when left=right, top=bottom.
Additionally, the sequence utilize the middle edge from the midnode to cut down on run time further, similar to the book logic.
The entire code is simply an expansion of the LinearEdgeAlignment code given in chapter 5.
Input:
top, bottom (int): Vertical boundaries for x.
left, right (int): Horizontal boundaries for y.
x (str): Sequence 1.
y (str): Sequence 2.
score_param (ScoreParam): Scoring parameters.
alignment (list[tuple]): Aligned sequence pairs.
Output: None
'''
# Treat template as the vertical axis and seq as horiontal axis.
if left == right:
# Align gaps to the remaining template nt when seq is empty
for i in range(top,bottom):
alignment.append('-')
return
elif top == bottom:
# Align remaining seq nt when template is empty
for j in range(left, right):
alignment.append(seq[j])
return
else:
# Recursion to divide and conquer with addition of midedge
middle = (left+right)//2
mid_node, mid_edge = mid_node_and_edge(top,bottom,left,right,template,seq) # Calculate the mide node on the middle column and the mid edge stemming from the MidNode
linear_edge_alignment(top, mid_node, left, middle, template, seq, alignment) # Calling top-left half rectangle recursion
# Output mid-edge. Calcutating the new top-left corner coordinate (mid_node,middle) for the bottom_right rectangle
if mid_edge == 'd': #Downward edge, the next middle node is (MidNode+1, middle)
alignment.append('-')
mid_node += 1
elif mid_edge == 'r': #Rightward edge, the next middle node is (MidNode, middle +1)
alignment.append(seq[middle])
middle += 1
elif mid_edge == 'm': #Diagonal edge, the next middle node is (MidNode+1,middle+1)
alignment.append(seq[middle])
mid_node += 1
middle += 1
linear_edge_alignment(mid_node, bottom, middle, right, template, seq, alignment) #Second recursion call of bottom-left rectangle.
def mid_node_and_edge(top,bottom,left,right,template,seq):
'''
input:
top, bottom (int): Vertical boundaries for x.
left, right (int): Horizontal boundaries for y.
x (str): Sequence 1.
y (str): Sequence 2.
score_param (ScoreParam): Scoring parameters.
output:
tuple(MidNode (int),MidEdge(str)): Index of the MidNode and direction of the MidEdge.
The purpose is to calculate the MiddleNode on the middle column that the longest path must pass through. Along with it, we calculate the edge or the next path from the middle node.
'''
current_template = [s[top:bottom] for s in template]
# Initialized current seq from left to right and create current template with each seq from top to bottom.
current_seq = seq[left:right]
middle = (right - left)//2 #middle of s2. Example: 6-2//2 = 2, which is middle from 0-4, not for s2, which would be 2+left = 4.
fromSource = scoring(current_template,current_seq[:middle], reverse_template=False)[0] #The matrix storing the score of longest path possible from the Source to each point in the middle column.
#The two matrix storing the score of longest path possble from the Sink back to middle column, calculated by aligning the reversed s1 and s2[middle:]
#Backtrack store the direction of the move taken to reach each point in the middle column. It is the direction of the best move to reach (i,middle).
toSink,backtrack = scoring(current_template, current_seq[middle:], reverse_template = True)
total = [0]* len(fromSource)
for i in range(len(fromSource)):
#Calculate the longest length of the paths that passes through each point on the middle column
total[i] = fromSource[i] + toSink[i]
max_index = total.index(max(total)) #Index of MidNode, node that the longest path passes through, on s1.
MidNode = top + max_index #MidNode index on full length sequence x
MidEdge = backtrack[max_index] #Direction of edge from MidNode on the longest path.
return MidNode, MidEdge
def scoring(template,seq, reverse_template = False):
'''
Aligning the current_seq with the template. Possible to do so in reverse to calculate alignment score from Sink to middle column.
'''
if reverse_template: # Reversing each sequence in the template if calculating toSink
current_template = [s[::-1] for s in template]
current_seq = seq[::-1]
else:
current_template = template
current_seq = seq
template_sequence_len = len(current_template[0]) # Length of each sequence in template.
alignment_score = AlignmentScore(current_template)
# Initializing three 1D list, storing the previous column score, current column score, and backtrack movement.
previous_score = [0] * (template_sequence_len+1)
current_score = [0] * (template_sequence_len+1)
backtrack = [None]*(template_sequence_len+1)
for i in range(1,template_sequence_len+1): # Initiating the alignment matrix
current_score[i]= current_score[i-1] + alignment_score.indel
for i in range(1, len(current_seq)+1): # Calculating the alignment score of each column from left -> right
previous_score = current_score[:] # Updating the previous column
current_score[0] = previous_score[0] + alignment_score.indel # Initiating the alignment score of the first row in the current column.
for j in range(1,template_sequence_len+1): # Calculate alignment score of each row from top -> bottom.
# Calculating the score of each movement direction.
diagonal = previous_score[j-1] + alignment_score.score(j-1,current_seq[i-1])
vertical = current_score[j-1] + alignment_score.indel
horizontal = previous_score[j] +alignment_score.indel
best = max(diagonal,horizontal,vertical)
current_score[j] = best
#storing the movement used to reach the current[j].
if best == diagonal:
backtrack[j] = "m"
elif best == vertical:
backtrack[j] = "d"
elif best == horizontal:
backtrack[j] = "r"
if reverse_template: # Reversing the alignment score and backtrack function if calculated the alignment from Sink.
current_score = current_score[::-1]
backtrack = backtrack[::-1]
return current_score, backtrack
Однако. для вывода hisrchberg хорошо справляется с случаем без вставки «-». Тем не менее, выходные данные для hirschberg Linear_edge_alignment содержат ошибочно вставленный знак «-». Например, для жадных будет вывод:
GT--G----GAGT
Вместо этого версия hirschberg выведет следующее:
GTG-G-----AGT< /p>
Я заметил, что сдвиг постоянно равен одному «-» или, в некоторых случаях, отсутствует «-» в конце и начале.
2D жадный :
---A-TGTTTG
Хиршберг:
ATGTTTG
Я не понимаю, почему это так. Пожалуйста, помогите
Это жадный код выравнивания, предназначенный для: [list] [*]Принимать шаблон, содержащий список последовательностей M, каждая из которых имеет длину M. Это список выровненные последовательности. [*]Использование его в классе AlignedScore, который создает оценку соответствия для выравнивающей последовательности на основе частоты встречаемости нуклеотидов в позиции pos по всей матрице. [*]Выравнивание каждой последовательности во входном файле с шаблоном, добавляя выровненную последовательность в шаблон для дальнейшего выравнивания. Это жадное выравнивание: [/list] [code]class MyClass: def __init__(self, x): self.indel = -5 self.d = {} self.l = len(x) for i in range(len(x[0])): self.d[i] = dict() for j in x: try: self.d[i][j[i]] += 1 except KeyError: self.d[i][j[i]] = 1
def s(self, i, j): if j == "N": return 1 return 2 * self.d[i].get(j, 0) / self.l - 1
def greedyAlign(x, s): o = MyClass(x) a = [[float("-inf")] * (len(s) + 1) for _ in range(len(x[0]) + 1)] b = [[None] * (len(s) + 1) for _ in range(len(x[0]) + 1)]
a[0][0] = 0 for i in range(len(x[0]) + 1): for j in range(len(s) + 1): if i > 0 and j > 0: g = o.s(i - 1, s[j - 1]) if a[i - 1][j - 1] + g > a[i][j]: a[i][j] = a[i - 1][j - 1] + g b[i][j] = 0 if i > 0 and a[i - 1][j] + o.indel > a[i][j]: a[i][j] = a[i - 1][j] + o.indel b[i][j] = 1 if j > 0 and a[i][j - 1] + o.indel > a[i][j]: a[i][j] = a[i][j - 1] + o.indel b[i][j] = 2
i = len(x[0]) j = len(s) c = [""] * len(x) d = ""
while i > 0 or j > 0: if b[i][j] == 1: i -= 1 for e, f in enumerate(c): c[e] = x[e][i] + f d = "-" + d elif b[i][j] == 2: j -= 1 for e, f in enumerate(c): c[e] = "-" + f d = s[j] + d else: i -= 1 j -= 1 for e, f in enumerate(c): c[e] = x[e][i] + f d = s[j] + d
c.append(d) return c [/code] Это моя чистая точка зрения и точка зрения Хиршберга: [code]class AlignmentScore: def __init__(self, template): self.indel = -5 # Horizontal or vertical movement score. Representing insertion or deletion.
# Construct a dictionary storing the number time nucleotide "nt" shows up at position int(pos) across all sequence in template. # Ex: self.post_dict[0]["A"] = 5 means there are 5 nt "A" at position 0 across len(template) number of sequence
self.pos_dict = {} # Itinialize pos_dict = dict(int(pos), dict("nt", int(count))). self.template_len = len(template) # Number of seq in template.
for pos in range(len(template[0])): # Iterating across the length of one seq in template. self.pos_dict[pos] = dict() for seq in template: # Iterating across all seq in template. nt = seq[pos] # Nucleotide at pos in seq. try: self.pos_dict[pos][nt] += 1 # Update number of the time nucleotide at position pos is nt across all seq in template. except KeyError: self.pos_dict[pos][nt] = 1 # Initialize nt count.
def score(self, pos, nt): # Calculate the score of a diagonal movement. The diagonal movement score is based on how frequently the nt occur at position pos. # Ex: seq if nt == "N": return 1 # Return the normalized score = 2* (Number of nt occurence at pos)/len(seq) -1 return 2 * self.pos_dict[pos].get(nt, 0) / self.template_len - 1 # -1 < Score < 1
def align(template, seq): ''' Implement the two sequence global alignment using a linear edge algorithm. Input : template Output : tuple of aligned DNA sequences ''' ###################implement here##################### alignment = [] #alignment stored outside of the recursion stored as a list of tuple, each containng the pair of aligned nucleotide of x and y template_seq_len = len(template[0]) linear_edge_alignment(0,template_seq_len,0,len(seq),template,seq,alignment) #start the recursion from 0 to length of each string
seq_aligned = "".join(alignment) #join the aligned of the sequence template.append(seq_aligned) return template
def linear_edge_alignment(top,bottom,left,right,template,seq,alignment): ''' This recursion divide the sequence into two again and again, storing the aligned best middle node. Until the base case, when len(x) or len(y) ==0, which is when left=right, top=bottom. Additionally, the sequence utilize the middle edge from the midnode to cut down on run time further, similar to the book logic. The entire code is simply an expansion of the LinearEdgeAlignment code given in chapter 5.
Input: top, bottom (int): Vertical boundaries for x. left, right (int): Horizontal boundaries for y. x (str): Sequence 1. y (str): Sequence 2. score_param (ScoreParam): Scoring parameters. alignment (list[tuple]): Aligned sequence pairs. Output: None ''' # Treat template as the vertical axis and seq as horiontal axis. if left == right: # Align gaps to the remaining template nt when seq is empty for i in range(top,bottom): alignment.append('-') return elif top == bottom: # Align remaining seq nt when template is empty for j in range(left, right): alignment.append(seq[j]) return else: # Recursion to divide and conquer with addition of midedge middle = (left+right)//2 mid_node, mid_edge = mid_node_and_edge(top,bottom,left,right,template,seq) # Calculate the mide node on the middle column and the mid edge stemming from the MidNode
linear_edge_alignment(top, mid_node, left, middle, template, seq, alignment) # Calling top-left half rectangle recursion # Output mid-edge. Calcutating the new top-left corner coordinate (mid_node,middle) for the bottom_right rectangle
if mid_edge == 'd': #Downward edge, the next middle node is (MidNode+1, middle) alignment.append('-') mid_node += 1 elif mid_edge == 'r': #Rightward edge, the next middle node is (MidNode, middle +1) alignment.append(seq[middle]) middle += 1 elif mid_edge == 'm': #Diagonal edge, the next middle node is (MidNode+1,middle+1) alignment.append(seq[middle]) mid_node += 1 middle += 1 linear_edge_alignment(mid_node, bottom, middle, right, template, seq, alignment) #Second recursion call of bottom-left rectangle.
def mid_node_and_edge(top,bottom,left,right,template,seq): ''' input: top, bottom (int): Vertical boundaries for x. left, right (int): Horizontal boundaries for y. x (str): Sequence 1. y (str): Sequence 2. score_param (ScoreParam): Scoring parameters. output: tuple(MidNode (int),MidEdge(str)): Index of the MidNode and direction of the MidEdge.
The purpose is to calculate the MiddleNode on the middle column that the longest path must pass through. Along with it, we calculate the edge or the next path from the middle node. ''' current_template = [s[top:bottom] for s in template] # Initialized current seq from left to right and create current template with each seq from top to bottom. current_seq = seq[left:right]
middle = (right - left)//2 #middle of s2. Example: 6-2//2 = 2, which is middle from 0-4, not for s2, which would be 2+left = 4.
fromSource = scoring(current_template,current_seq[:middle], reverse_template=False)[0] #The matrix storing the score of longest path possible from the Source to each point in the middle column.
#The two matrix storing the score of longest path possble from the Sink back to middle column, calculated by aligning the reversed s1 and s2[middle:] #Backtrack store the direction of the move taken to reach each point in the middle column. It is the direction of the best move to reach (i,middle). toSink,backtrack = scoring(current_template, current_seq[middle:], reverse_template = True)
total = [0]* len(fromSource) for i in range(len(fromSource)): #Calculate the longest length of the paths that passes through each point on the middle column total[i] = fromSource[i] + toSink[i]
max_index = total.index(max(total)) #Index of MidNode, node that the longest path passes through, on s1. MidNode = top + max_index #MidNode index on full length sequence x
MidEdge = backtrack[max_index] #Direction of edge from MidNode on the longest path.
return MidNode, MidEdge
def scoring(template,seq, reverse_template = False): ''' Aligning the current_seq with the template. Possible to do so in reverse to calculate alignment score from Sink to middle column. ''' if reverse_template: # Reversing each sequence in the template if calculating toSink current_template = [s[::-1] for s in template] current_seq = seq[::-1] else: current_template = template current_seq = seq
template_sequence_len = len(current_template[0]) # Length of each sequence in template. alignment_score = AlignmentScore(current_template)
# Initializing three 1D list, storing the previous column score, current column score, and backtrack movement. previous_score = [0] * (template_sequence_len+1) current_score = [0] * (template_sequence_len+1) backtrack = [None]*(template_sequence_len+1)
for i in range(1,template_sequence_len+1): # Initiating the alignment matrix current_score[i]= current_score[i-1] + alignment_score.indel
for i in range(1, len(current_seq)+1): # Calculating the alignment score of each column from left -> right previous_score = current_score[:] # Updating the previous column current_score[0] = previous_score[0] + alignment_score.indel # Initiating the alignment score of the first row in the current column. for j in range(1,template_sequence_len+1): # Calculate alignment score of each row from top -> bottom. # Calculating the score of each movement direction. diagonal = previous_score[j-1] + alignment_score.score(j-1,current_seq[i-1]) vertical = current_score[j-1] + alignment_score.indel horizontal = previous_score[j] +alignment_score.indel
best = max(diagonal,horizontal,vertical) current_score[j] = best
#storing the movement used to reach the current[j]. if best == diagonal: backtrack[j] = "m" elif best == vertical: backtrack[j] = "d" elif best == horizontal: backtrack[j] = "r" if reverse_template: # Reversing the alignment score and backtrack function if calculated the alignment from Sink. current_score = current_score[::-1] backtrack = backtrack[::-1] return current_score, backtrack [/code] Однако. для вывода hisrchberg хорошо справляется с случаем без вставки «-». Тем не менее, выходные данные для hirschberg Linear_edge_alignment содержат ошибочно вставленный знак «-». Например, для жадных будет вывод: GT--G----GAGT Вместо этого версия hirschberg выведет следующее: GTG-G-----AGT< /p> Я заметил, что сдвиг постоянно равен одному «-» или, в некоторых случаях, отсутствует «-» в конце и начале. 2D жадный : ---A-TGTTTG Хиршберг: ATGTTTG Я не понимаю, почему это так. Пожалуйста, помогите
Мне был предоставлен сгенерированный CSV-файл со сведениями о 6000 учащихся в формате:
Tutorial Group,Student ID,School,Name,Gender,CGPA
G-1,5002,CS,John Doe,Male,4.02 (пример)
Файл CSV отсортирован по Группе обучения, в каждом уроке участвуют...
Я работаю над проектом по подсчету шагов во входном видео и экспериментирую с методами оценки позы, такими как YOLOv8 и MediaPipe. Моя цель — охватить следующие тестовые случаи:
В кадре находится только верхняя часть тела человека, но он ходит....
Я работаю над проектом по подсчету шагов во входном видео и экспериментирую с методами оценки позы, такими как YOLOv8 и MediaPipe. Моя цель — охватить следующие тестовые случаи:
В кадре находится только верхняя часть тела человека, но он ходит....
У меня есть куча файлов, которые мне нужно скопировать в другое место. Предложите мне оптимизированный подход. В будущем, если в мое приложение будет добавлен какой-либо другой файл, мне не придется изменять код. Существует один подход: мы можем...
Я новичок, изучаю алгоритм. Я узнал, что существует три категории алгоритма Дейкстры с различными методами реализации структуры данных, включая чистый алгоритм Дейкстры, оптимизированный с помощью двоичной кучи и оптимизированный с помощью кучи...