Ошибка в коде выравнивания. Разница в результатах оптимизированного Хиршберга и 2D-жадной эвристикиPython

Программы на Python
Ответить Пред. темаСлед. тема
Anonymous
 Ошибка в коде выравнивания. Разница в результатах оптимизированного Хиршберга и 2D-жадной эвристики

Сообщение Anonymous »

Это жадный код выравнивания, предназначенный для:
  • Принимать шаблон, содержащий список последовательностей M, каждая из которых имеет длину M. Это список выровненные последовательности.
  • Использование его в классе AlignedScore, который создает оценку соответствия для выравнивающей последовательности на основе частоты встречаемости нуклеотидов в позиции pos по всей матрице.
  • Выравнивание каждой последовательности во входном файле с шаблоном, добавляя выровненную последовательность в шаблон для дальнейшего выравнивания.
    Это жадное выравнивание:

Код: Выделить всё

class MyClass:
def __init__(self, x):
self.indel = -5
self.d = {}
self.l = len(x)
for i in range(len(x[0])):
self.d[i] = dict()
for j in x:
try:
self.d[i][j[i]] += 1
except KeyError:
self.d[i][j[i]] = 1

def s(self, i, j):
if j == "N":
return 1
return 2 * self.d[i].get(j, 0) / self.l - 1

def greedyAlign(x, s):
o = MyClass(x)
a = [[float("-inf")] * (len(s) + 1) for _ in range(len(x[0]) + 1)]
b = [[None] * (len(s) + 1) for _ in range(len(x[0]) + 1)]

a[0][0] = 0
for i in range(len(x[0]) + 1):
for j in range(len(s) + 1):
if i > 0 and j > 0:
g = o.s(i - 1, s[j - 1])
if a[i - 1][j - 1] + g > a[i][j]:
a[i][j] = a[i - 1][j - 1] + g
b[i][j] = 0
if i > 0 and a[i - 1][j] + o.indel > a[i][j]:
a[i][j] = a[i - 1][j] + o.indel
b[i][j] = 1
if j > 0 and a[i][j - 1] + o.indel > a[i][j]:
a[i][j] = a[i][j - 1] + o.indel
b[i][j] = 2

i = len(x[0])
j = len(s)
c = [""] * len(x)
d = ""

while i > 0 or j > 0:
if b[i][j] == 1:
i -= 1
for e, f in enumerate(c):
c[e] = x[e][i] + f
d = "-" + d
elif b[i][j] == 2:
j -= 1
for e, f in enumerate(c):
c[e] = "-" + f
d = s[j] + d
else:
i -= 1
j -= 1
for e, f in enumerate(c):
c[e] = x[e][i] + f
d = s[j] + d

c.append(d)
return c
Это моя чистая точка зрения и точка зрения Хиршберга:

Код: Выделить всё

class AlignmentScore:
def __init__(self, template):
self.indel = -5 # Horizontal or vertical movement score. Representing insertion or deletion.

# Construct a dictionary storing the number time nucleotide "nt" shows up at position int(pos) across all sequence in template.
# Ex: self.post_dict[0]["A"] = 5 means there are 5 nt "A" at position 0 across len(template) number of sequence

self.pos_dict = {} # Itinialize pos_dict = dict(int(pos), dict("nt", int(count))).
self.template_len = len(template) # Number of seq in template.

for pos in range(len(template[0])): # Iterating across the length of one seq in template.
self.pos_dict[pos] = dict()
for seq in template: # Iterating across all seq in template.
nt = seq[pos] # Nucleotide at pos in seq.
try:
self.pos_dict[pos][nt] += 1 # Update number of the time nucleotide at position pos is nt across all seq in template.
except KeyError:
self.pos_dict[pos][nt] = 1 # Initialize nt count.

def score(self, pos, nt): # Calculate the score of a diagonal movement.  The diagonal movement score is based on how frequently the nt occur at position pos.
# Ex: seq
if nt == "N":
return 1
# Return the normalized score = 2* (Number of nt occurence at pos)/len(seq) -1
return 2 * self.pos_dict[pos].get(nt, 0) / self.template_len - 1 # -1 < Score < 1

def align(template, seq):
'''
Implement the two sequence global alignment using a linear edge algorithm.
Input : template
Output : tuple of aligned DNA sequences
'''
###################implement here#####################
alignment = [] #alignment stored outside of the recursion stored as a list of tuple, each containng the pair of aligned nucleotide of x and y
template_seq_len = len(template[0])
linear_edge_alignment(0,template_seq_len,0,len(seq),template,seq,alignment) #start the recursion from 0 to length of each string

seq_aligned = "".join(alignment) #join the aligned of the sequence
template.append(seq_aligned)
return template

def linear_edge_alignment(top,bottom,left,right,template,seq,alignment):
'''
This recursion divide the sequence into two again and again, storing the aligned best middle node. Until the base case, when len(x) or len(y) ==0, which
is when left=right, top=bottom.
Additionally, the sequence utilize the middle edge from the midnode to cut down on run time further, similar to the book logic.
The entire code is simply an expansion of the LinearEdgeAlignment code given in chapter 5.

Input:
top, bottom (int): Vertical boundaries for x.
left, right (int): Horizontal boundaries for y.
x (str): Sequence 1.
y (str): Sequence 2.
score_param (ScoreParam): Scoring parameters.
alignment (list[tuple]): Aligned sequence pairs.
Output: None
'''
# Treat template as the vertical axis and seq as horiontal axis.
if left == right:
# Align gaps to the remaining template nt when seq is empty
for i in range(top,bottom):
alignment.append('-')
return
elif top == bottom:
# Align remaining seq nt when template is empty
for j in range(left, right):
alignment.append(seq[j])
return
else:
# Recursion to divide and conquer with addition of midedge
middle = (left+right)//2
mid_node, mid_edge = mid_node_and_edge(top,bottom,left,right,template,seq) # Calculate the mide node on the middle column and the mid edge stemming from the MidNode

linear_edge_alignment(top, mid_node, left, middle, template, seq, alignment) # Calling top-left half rectangle recursion
# Output mid-edge. Calcutating the new top-left corner coordinate (mid_node,middle) for the bottom_right rectangle

if mid_edge == 'd':  #Downward edge, the next middle node is (MidNode+1, middle)
alignment.append('-')
mid_node += 1
elif mid_edge == 'r':  #Rightward edge, the next middle node is (MidNode, middle +1)
alignment.append(seq[middle])
middle += 1
elif mid_edge == 'm':  #Diagonal edge, the next middle node is (MidNode+1,middle+1)
alignment.append(seq[middle])
mid_node += 1
middle += 1
linear_edge_alignment(mid_node, bottom, middle, right, template, seq, alignment) #Second recursion call of bottom-left rectangle.

def mid_node_and_edge(top,bottom,left,right,template,seq):
'''
input:
top, bottom (int): Vertical boundaries for x.
left, right (int): Horizontal boundaries for y.
x (str): Sequence 1.
y (str): Sequence 2.
score_param (ScoreParam): Scoring parameters.
output:
tuple(MidNode (int),MidEdge(str)): Index of the MidNode and direction of the MidEdge.

The purpose is to calculate the MiddleNode on the middle column that the longest path must pass through. Along with it, we calculate the edge or the next path from the middle node.
'''
current_template = [s[top:bottom] for s in template]
# Initialized current seq from left to right and create current template with each seq from top to bottom.
current_seq = seq[left:right]

middle = (right - left)//2 #middle of s2. Example: 6-2//2 = 2, which is middle from 0-4, not for s2, which would be 2+left = 4.

fromSource = scoring(current_template,current_seq[:middle], reverse_template=False)[0] #The matrix storing the score of longest path possible from the Source to each point in the middle column.

#The two matrix storing the score of longest path possble from the Sink back to middle column, calculated by aligning the reversed s1 and s2[middle:]
#Backtrack store the direction of the move taken to reach each point in the middle column. It is the direction of the best move to reach (i,middle).
toSink,backtrack = scoring(current_template, current_seq[middle:], reverse_template = True)

total = [0]* len(fromSource)
for i in range(len(fromSource)):
#Calculate the longest length of the paths that passes through each point on the middle column
total[i] = fromSource[i] + toSink[i]

max_index = total.index(max(total)) #Index of MidNode, node that the longest path passes through, on s1.
MidNode = top + max_index #MidNode index on full length sequence x

MidEdge = backtrack[max_index] #Direction of edge from MidNode on the longest path.

return MidNode, MidEdge

def scoring(template,seq, reverse_template = False):
'''
Aligning the current_seq with the template. Possible to do so in reverse to calculate alignment score from Sink to middle column.
'''
if reverse_template: # Reversing each sequence in the template if calculating toSink
current_template = [s[::-1] for s in template]
current_seq = seq[::-1]
else:
current_template = template
current_seq = seq

template_sequence_len = len(current_template[0]) # Length of each sequence in template.
alignment_score = AlignmentScore(current_template)

# Initializing three 1D list, storing the previous column score, current column score, and backtrack movement.
previous_score = [0] * (template_sequence_len+1)
current_score = [0] * (template_sequence_len+1)
backtrack = [None]*(template_sequence_len+1)

for i in range(1,template_sequence_len+1): # Initiating the alignment matrix
current_score[i]= current_score[i-1] + alignment_score.indel

for i in range(1, len(current_seq)+1): # Calculating the alignment score of each column from left -> right
previous_score = current_score[:] # Updating the previous column
current_score[0] = previous_score[0] + alignment_score.indel # Initiating the alignment score of the first row in the current column.
for j in range(1,template_sequence_len+1): # Calculate alignment score of each row from top -> bottom.
# Calculating the score of each movement direction.
diagonal = previous_score[j-1] + alignment_score.score(j-1,current_seq[i-1])
vertical = current_score[j-1] + alignment_score.indel
horizontal = previous_score[j] +alignment_score.indel

best = max(diagonal,horizontal,vertical)
current_score[j] = best

#storing the movement used to reach the current[j].
if best == diagonal:
backtrack[j] = "m"
elif best == vertical:
backtrack[j] = "d"
elif best == horizontal:
backtrack[j] = "r"
if reverse_template: # Reversing the alignment score and backtrack function if calculated the alignment from Sink.
current_score = current_score[::-1]
backtrack = backtrack[::-1]
return current_score, backtrack
Однако. для вывода hisrchberg хорошо справляется с случаем без вставки «-». Тем не менее, выходные данные для hirschberg Linear_edge_alignment содержат ошибочно вставленный знак «-». Например, для жадных будет вывод:
GT--G----GAGT
Вместо этого версия hirschberg выведет следующее:
GTG-G-----AGT< /p>
Я заметил, что сдвиг постоянно равен одному «-» или, в некоторых случаях, отсутствует «-» в конце и начале.
2D жадный :
---A-TGTTTG
Хиршберг:
ATGTTTG
Я не понимаю, почему это так. Пожалуйста, помогите

Подробнее здесь: https://stackoverflow.com/questions/792 ... berg-vs-2d
Реклама
Ответить Пред. темаСлед. тема

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

  • Похожие темы
    Ответы
    Просмотры
    Последнее сообщение

Вернуться в «Python»