
# Parse the FASTA file into a dict that maps each header line (without the
# leading '>') to its sequence with the line breaks removed.
curr_name, curr_seq, dic_name_to_seq = '', '', {}

# Sequence lines of the record currently being read. They are joined once per
# record instead of growing a string with += on every line.
seq_parts = []

with open('gencode.v41.lncRNA_transcripts.fa','r') as f_in:
    for line in f_in:
        if line.startswith('>'):
            # A header line ends the previous record. Records that have no
            # sequence text are not stored.
            curr_seq = ''.join(seq_parts)
            if curr_seq:
                dic_name_to_seq[curr_name] = curr_seq
            curr_name = line.strip('\n')[1:]
            seq_parts = []
        else:
            seq_parts.append(line.strip('\n'))

# The last record is not followed by a header, so it is stored here.
curr_seq = ''.join(seq_parts)
if curr_seq:
    dic_name_to_seq[curr_name] = curr_seq

print(f'The number of sequences we obtained from file, {len(dic_name_to_seq)}')

# we want to find sequences with Gencode Transcript ID ENST00000441724.1 and Ensembl Gene ID ENSG00000225077.3
# Gencode Transcript | Ensembl Gene ID | havana_gene | other annotation/description....
# ENST00000441724.1|ENSG00000225077.3|OTTHUMG00000001255.5

# Scan the parsed records for the first header that begins with the wanted
# "transcript ID|gene ID" prefix. Both values stay empty strings when no
# header matches.
target_name, target_seq = next(
    (
        (header, sequence)
        for header, sequence in dic_name_to_seq.items()
        if header.startswith('ENST00000441724.1|ENSG00000225077.3')
    ),
    ('', ''),
)

print(f'Sequence name:{target_name}')
print(f'Sequence length:{len(target_seq)}')
print(f'Sequence:{target_seq}')

# we can use defaultdict(int) as a counter
from collections import defaultdict

k = 6

# Slide a window of width k across the target sequence and tally each
# substring; defaultdict(int) supplies 0 for substrings not seen before.
counter = defaultdict(int)
window_start = 0
while window_start + k <= len(target_seq):
    counter[target_seq[window_start:window_start + k]] += 1
    window_start += 1

# Distinct k-mers ordered by descending count. The sort is stable, so k-mers
# with equal counts stay in first-seen order.
sorted_k_mers = list(counter)
sorted_k_mers.sort(key=counter.get, reverse=True)

print(f"Most frequent 6-mers:{sorted_k_mers[0]}")

print(f"Most frequent 6-mers occured {counter[sorted_k_mers[0]]} times.")

print('\nTop 5 K-mers and frequencies:')
for k_mers in sorted_k_mers[:5]:
    print(f'\t{k_mers} =  {counter[k_mers]} times')


#an alternative
from collections import Counter

k = 6

# Count every length-k window of the target sequence in a single pass.
counter = Counter(
    target_seq[start:start + k] for start in range(len(target_seq) - k + 1)
)

# most_common() lists (k-mer, count) pairs from the highest to the lowest
# count; only the k-mers are kept here.
sorted_k_mers = [k_mer for k_mer, _ in counter.most_common()]

print(f"Most frequent 6-mers:{sorted_k_mers[0]}")

print(f"Most frequent 6-mers occured {counter[sorted_k_mers[0]]} times.")

print('\nTop 5 K-mers and frequencies:')
for k_mers in sorted_k_mers[:5]:
    print(f'\t{k_mers} =  {counter[k_mers]} times')

# find top k-mers for any k
# we can use Counter instead of defaultdict(int)
from collections import Counter


def find_top5_k_mers(k, seq):
    """Print the most frequent k-mer of ``seq`` and its five most frequent k-mers.

    Every overlapping substring of length ``k`` is counted. K-mers with equal
    counts are listed in the order in which they first occur in ``seq``.

    Args:
        k: Length of the substrings to count.
        seq: Sequence string to scan.

    Returns:
        None. The report is printed to standard output. When ``seq`` is
        shorter than ``k`` there is nothing to rank, so a short notice is
        printed instead of raising ``IndexError``.
    """
    counter = Counter(seq[i-k:i] for i in range(k, len(seq)+1))

    # No window fits into the sequence: report that and stop, because the
    # "most frequent" lookup below needs at least one entry.
    if not counter:
        print(f"No {k}-mers found: the sequence has fewer than {k} characters.")
        return

    top_five = counter.most_common(5)
    best_k_mer, best_count = top_five[0]

    print(f"Most frequent {k}-mers:{best_k_mer}")
    print(f"Most frequent {k}-mers occured {best_count} times.")

    print(f'\nTop 5 {k}-mers and frequencies:')
    for k_mer, count in top_five:
        print(f'\t{k_mer} =  {count} times')

# Print the 5-mer report for the target sequence.
find_top5_k_mers(5, target_seq)

# we can use Counter instead of defaultdict(int)
from collections import Counter


def find_top_n_k_mers(n, k, seq):
    """Print the most frequent k-mer of ``seq`` and its first ``n`` ranked k-mers.

    Every overlapping substring of length ``k`` is counted. K-mers with equal
    counts are listed in the order in which they first occur in ``seq``.

    Args:
        n: How many ranked k-mers to list; used as the end of a slice over
            the ranked list.
        k: Length of the substrings to count.
        seq: Sequence string to scan.

    Returns:
        None. The report is printed to standard output. When ``seq`` is
        shorter than ``k`` there is nothing to rank, so a short notice is
        printed instead of raising ``IndexError``.
    """
    counter = Counter(seq[i-k:i] for i in range(k, len(seq)+1))

    # No window fits into the sequence: report that and stop, because the
    # "most frequent" lookup below needs at least one entry.
    if not counter:
        print(f"No {k}-mers found: the sequence has fewer than {k} characters.")
        return

    # Rank everything first: the overall winner is reported even when n
    # selects no rows.
    ranked = counter.most_common()
    best_k_mer, best_count = ranked[0]

    print(f"Most frequent {k}-mers:{best_k_mer}")
    print(f"Most frequent {k}-mers occured {best_count} times.")

    print(f'\nTop {n} {k}-mers and frequencies:')
    for k_mer, count in ranked[:n]:
        print(f'\t{k_mer} =  {count} times')
# Print the 9-mer report for the target sequence, listing two entries.
find_top_n_k_mers(2, 9, target_seq)

# we can write content to file like this
f_write = open('new_file.txt', 'w')
try:
    print("Some content we'd like to write.", file=f_write)
finally:
    # Close the handle even if the write fails, so it is never leaked.
    f_write.close()

#an alternative to write to a file
def write_k_mers_frequencies_to_file(n, k, seq, file_name):
    """Write the first ``n`` frequency-ranked k-mers of ``seq`` to a text file.

    The file begins with a tab-separated ``k_mer`` / ``frequency`` header row,
    followed by one tab-separated row per k-mer, most frequent first. K-mers
    with equal counts keep the order in which they first occur in ``seq``.
    ``file_name`` is opened in write mode.
    """
    tally = Counter(seq[start:start + k] for start in range(len(seq) - k + 1))
    ranked = tally.most_common()
    header = '\t'.join(('k_mer', 'frequency'))

    with open(file_name, 'w') as f_out:
        f_out.write(header + '\n')
        f_out.writelines(
            f'{k_mer}\t{count}\n' for k_mer, count in ranked[:n]
        )

# Write the first five ranked 7-mers of the target sequence to 'k_mer_frequencies.txt'.
write_k_mers_frequencies_to_file(5, 7, target_seq, 'k_mer_frequencies.txt')
