1.4 리딩 프레임

🧬 단백질의 번역 과정은 반드시 ATG 코돈에서 시작하기 때문에 이것을 찾아야만 어디가 번역의 시작점인지 알 수 있다.
🧬 하지만 이 코돈이 항상 서열의 시작점에 있는 것이 아니기 때문에 찾는 과정이 필요하다.
🧬 ATG를 찾기 위해서 염기서열을 나누는 것을 리딩 프레임이라고 한다.
🧬 첫번째문자, 두번째문자, 세번째문자부터 시작하는 리딩 프레임이 존재하며, 이 서열이 주형가닥이 아닐때의 경우도 고려하여 하나의 염기 서열에는 총 여섯개의 리딩 프레임이 존재한다.

1.4.1. 여섯개의 리딩프레임에서 DNA 서열을 계산


🧬 DNA 서열을 아미노산으로 변경해주는 translate_seq( ) 함수를 이용하여 res 리스트에 아미노산 서열 입력
🧬 역상보서열을 만드는 reverse_complement( ) 함수를 이용하여 역상보서열에 대한 아미노산 서열을 생성
🧬 아미노산 서열 res 반환

def reading_frames(dna_seq):
    """Translate a DNA sequence in all six reading frames.

    The result holds three translations of ``dna_seq`` (second argument to
    ``translate_seq`` set to 0, 1 and 2) followed by the same three
    translations of its reverse complement.

    Args:
        dna_seq: DNA sequence accepted by ``validate_dna``.

    Returns:
        A list of six translated sequences: forward frames first, then the
        reverse-complement frames.

    Raises:
        AssertionError: If ``validate_dna(dna_seq)`` is falsy.
    """
    assert validate_dna(dna_seq), "Invalid DNA sequence"

    # Forward strand: one translation per start offset.
    frames = [translate_seq(dna_seq, offset) for offset in range(3)]

    # Reverse complement: lets the opposite strand be read front to back.
    rev_comp = reverse_complement(dna_seq)
    frames.extend(translate_seq(rev_comp, offset) for offset in range(3))
    return frames

print(reading_frames("ATGGGATCGTAGTCGTACTAGCTAGCTGATGGTACTCGATAGTCTACGTAGCTAGTGGTACTGGATGGTACTCAGTAACAT"))
>> ['MGS_SY_LADGTR_ST_LVVLDGTQ_H', 'WDRSRTS_LMVLDSLRS_WYWMVLSN', 'GIVVVLAS_WYSIVYVASGTGWYSVT',
    'MLLSTIQYH_LRRLSSTIS_LVRLRSH', 'CY_VPSSTTSYVDYRVPSAS_YDYDP', 'VTEYHPVPLAT_TIEYHQLASTTTIP']

💊 역상보 서열을 사용한 이유는 원래 뒤에서부터 읽는 상보적인 서열을 앞에서부터 읽어오기 위함이다.
여섯개의 리딩프레임에 대해서 계산했기 때문에 반환된 아미노산 리스트도 여섯개의 서열을 가지고 있다.

1.4.2. 아미노산 서열에서 가능한 단백질 리스트 생성


🧬 오픈리딩 프레임(ORF) : 단백질로 번역될 가능성이 있는 서열
🧬 단백질 번역은 개시코돈에서 시작하여 종결코돈에서 끝나기 때문에 종결코돈의 유무를 확인해야함
🧬 all_proteins_rf( ) 함수 생성
🧬 아미노산 서열을 받아서 오픈리딩프레임 확인
🧬 단백질 서열 proteins 반환

def all_proteins_rf(aa_seq):
    """Extract every candidate protein from one translated reading frame.

    A candidate starts at an ``M`` residue and runs up to, but not
    including, the next ``_`` (stop) symbol. Every ``M`` seen before a stop
    opens its own candidate, so nested starts yield overlapping proteins
    (``"MAMB_"`` gives ``["MAMB", "MB"]``). Candidates that never reach a
    stop symbol are discarded.

    Args:
        aa_seq: Amino-acid string using ``_`` as the stop symbol; it is
            upper-cased before scanning.

    Returns:
        List of protein strings, grouped by the stop that closes them and,
        within a group, ordered by start position.
    """
    aa_seq = aa_seq.upper()
    proteins = []
    open_starts = []  # indices of every "M" seen since the last stop

    for pos, aa in enumerate(aa_seq):
        if aa == "_":
            # A stop closes all open candidates at once; slicing avoids
            # rebuilding each candidate string one residue at a time.
            proteins.extend(aa_seq[start:pos] for start in open_starts)
            open_starts = []
        elif aa == "M":
            open_starts.append(pos)

    return proteins

print(all_proteins_rf("MDRYRA_DMGERTY_RRYGMAYRGD_"))
>> ['MDRYRA', 'MGERTY', 'MAYRGD']

M으로 시작해서 _으로 끝나는 아미노산서열만 단백질서열이 된것을 확인하자.

1.4.3. 모든 오픈 리딩프레임에서 가능한 단백질 계산


🧬 reading_frames( ) : DNA 서열을 받아서 가능한 아미노산 서열을 반환 (6개)
🧬 all_proteins_rf( ) : 아미노산 서열을 받아서 종결코돈을 고려한 단백질을 반환
🧬 all_orfs( ) 함수 생성 - DNA 서열을 받아서 단백질 서열 반환

def all_orfs(dna_seq):
    """Collect the candidate proteins found in every reading frame.

    Each translated frame returned by ``reading_frames`` is scanned with
    ``all_proteins_rf`` and the results are concatenated in frame order.

    Args:
        dna_seq: DNA sequence accepted by ``validate_dna``.

    Returns:
        A flat list of protein strings, in the order the frames yield them.

    Raises:
        AssertionError: If ``validate_dna(dna_seq)`` is falsy.
    """
    assert validate_dna(dna_seq), "Invalid DNA sequence"
    frames = reading_frames(dna_seq)
    found = []
    for frame in frames:
        # Flatten: add this frame's proteins directly to the result.
        found.extend(all_proteins_rf(frame))
    return found

print(all_orfs("ATGGGATCGTAGTCGTACTAGCTAGCTGATGGTACTCGATAGTCTACGTAGCTAGTGGTACTGGATGGTACTCAGTAACAT"))
>> ['MGS', 'MVLDSLRS', 'MLLSTIQYH']

all_orfs( ) 함수에서 reading_frames( )과 all_proteins_rf( )를 호출하여 DNA서열을 받아 한번에 단백질서열을 만든다.

1.4.4. 가능한 단백질을 최소 크기로 거르기


🧬 실제로는 단백질 크기가 작을수록 발현할 가능성이 크기 때문에 이를 고려해주면 좋음
🧬 all_orfs_ord( ) 함수 생성 - DNA 서열과 최소 크기를 인수로 받아 단백질 서열을 크기별로 정렬해주는 함수

# Sorted-insertion function
def all_orfs_ord(dna_seq, minsize=0):
    """Return candidate proteins from all reading frames, longest first.

    Only proteins whose length is strictly greater than ``minsize`` are
    kept; each one is placed into the result with ``insert_prot_ord``.

    Args:
        dna_seq: DNA sequence accepted by ``validate_dna``.
        minsize: Length threshold; a protein of exactly this length is
            excluded.

    Returns:
        The list of kept proteins as built by ``insert_prot_ord``.

    Raises:
        AssertionError: If ``validate_dna(dna_seq)`` is falsy.
    """
    assert validate_dna(dna_seq), "Invalid DNA sequence"
    frames = reading_frames(dna_seq)
    ordered = []
    for frame in frames:
        for candidate in all_proteins_rf(frame):
            if len(candidate) <= minsize:
                continue  # not strictly above the threshold
            insert_prot_ord(candidate, ordered)
    return ordered

# Sorting function
def insert_prot_ord(prot, list_prots):
    """Insert ``prot`` into ``list_prots``, keeping it ordered longest first.

    ``list_prots`` is modified in place and nothing is returned. The new
    entry goes in front of the first element that is not longer than it, so
    among equal lengths the most recently inserted protein comes first.

    Args:
        prot: Protein string to insert.
        list_prots: List already ordered by decreasing length.
    """
    new_len = len(prot)
    slot = len(list_prots)  # default: append at the end
    for idx, existing in enumerate(list_prots):
        if len(existing) <= new_len:
            # First element that is not strictly longer marks the spot.
            slot = idx
            break
    list_prots.insert(slot, prot)
    
print(all_orfs_ord("ATGGGATCGTAGTCGTACTAGCTAGCTGATGGTACTCGATAGTCTACGTAGCTAGTGGTACTGGATGGTACTCAGTAACAT"))
>> ['MLLSTIQYH', 'MVLDSLRS', 'MGS']

단백질 서열이 큰 순서대로 정렬되었다.


💡 Bioinformatics Algorithms(에이콘출판, 2020)를 공부하고 개인 학습용으로 정리한 자료입니다.

Leave a comment