
# Build a dictionary keyed by the BED record
# (chrom, chrom_start, chrom_end, gene_name) with the interval length as value.
dic = {}
with open('C:/users/xiaoman/downloads/bedExample2.bed', 'r') as f_in:
  for i, line in enumerate(f_in):

    # Demo only: show the raw tab-separated fields of the first line.
    if i == 0:
      print(line.split('\t'))

    # Skip blank lines (e.g. a trailing empty line); they would otherwise
    # fail the four-field unpacking below.
    if not line.strip():
      continue

    content = line.strip('\n').split('\t')
    chrom, chrom_start, chrom_end, gene_name = content[:4]
    # The coordinates are read as text; convert them to integers.
    chrom_start, chrom_end = int(chrom_start), int(chrom_end)
    # Per the BED format specification, intervals are 0-based and half-open
    # (chrom_end is exclusive), so the length is end - start with no "+ 1".
    length = chrom_end - chrom_start

    dic[(chrom, chrom_start, chrom_end, gene_name)] = length

# Sort the records by length, breaking ties by the record tuple itself.
# Iterating a dict yields its keys, so no intermediate list is needed.
data = sorted(dic, key=lambda x: (dic[x], x))


# Collect one row per BED record as [length, chrom, start, end, gene_name].
# The length comes first so that a plain list sort orders rows by length.
data = []
with open('c:/users/xiaoman/downloads/bedExample2.bed', 'r') as f_in:
  for i, line in enumerate(f_in):

    # Demo only: show the first line raw, split as-is, and split after
    # removing the trailing newline.
    if i == 0:
      print('orginal', line)
      print('Before strip', line.split('\t'))
      print('After strip', line.strip('\n').split('\t'))

    # Skip blank lines (e.g. a trailing empty line); they would otherwise
    # fail the four-field unpacking below.
    if not line.strip():
      continue

    content = line.strip('\n').split('\t')
    chrom, chrom_start, chrom_end, gene_name = content[:4]
    # The coordinates are read as text; convert them to integers.
    chrom_start, chrom_end = int(chrom_start), int(chrom_end)
    # Per the BED format specification, intervals are 0-based and half-open
    # (chrom_end is exclusive), so the length is end - start with no "+ 1".
    length = chrom_end - chrom_start
    data.append([length, chrom, chrom_start, chrom_end, gene_name])

# Rows compare element by element, so this sorts by length first and then
# by chrom, start, end and gene name to break ties.
data.sort()
# Peek at both ends of the sorted list. The slices are printed explicitly
# because a bare expression statement shows nothing when run as a script.
print(data[:3])   # first three rows
print(data[-3:])  # last three rows (data[:-3] would be all BUT the last three)

# Output the largest 5 genes, longest first.
# Slicing (instead of counting indices down from len(data) - 1) simply
# yields fewer rows when the list has fewer than 5 entries; the index loop
# would run into negative indices and print some rows twice.
for length, chrom, chrom_start, chrom_end, gene_name in reversed(data[-5:]):
  print(f"chrom : {chrom}, chrom_start : {chrom_start}, chrom_end : {chrom_end}, gene_name: {gene_name}, length : {length}")

# Output the smallest 5 genes, shortest first.
# Slicing is safe when the list has fewer than 5 rows, whereas indexing
# data[0] .. data[4] would raise IndexError.
for length, chrom, chrom_start, chrom_end, gene_name in data[:5]:
  print(f"chrom : {chrom}, chrom_start : {chrom_start}, chrom_end : {chrom_end}, gene_name: {gene_name}, length : {length}")

  

# An alternative approach based on dictionary.

data = []  # records as (chrom, chrom_start, chrom_end, gene_name) tuples
dic = {}   # record tuple -> interval length
with open('bedExample2.bed', 'r') as f_in:
  for i, line in enumerate(f_in):

    # Demo only: show the raw tab-separated fields of the first line.
    if i == 0:
      print(line.split('\t'))

    # Skip blank lines (e.g. a trailing empty line); they would otherwise
    # fail the four-field unpacking below.
    if not line.strip():
      continue

    content = line.strip('\n').split('\t')
    chrom, chrom_start, chrom_end, gene_name = content[:4]
    # The coordinates are read as text; convert them to integers.
    chrom_start, chrom_end = int(chrom_start), int(chrom_end)
    # Per the BED format specification, intervals are 0-based and half-open
    # (chrom_end is exclusive), so the length is end - start with no "+ 1".
    length = chrom_end - chrom_start
    # Tuples are hashable, so the same record can serve as the dictionary key.
    data.append((chrom, chrom_start, chrom_end, gene_name))
    dic[(chrom, chrom_start, chrom_end, gene_name)] = length

# Sort the records by their stored length, breaking ties by the record itself.
data.sort(key=lambda x: (dic[x], x))


# Output the largest 5 genes, longest first.
# Slicing (instead of counting indices down from len(data) - 1) simply
# yields fewer records when the list has fewer than 5 entries; the index
# loop would run into negative indices and print some records twice.
for record in reversed(data[-5:]):
  chrom, chrom_start, chrom_end, gene_name = record
  # The length is not stored in the record; look it up in the dictionary.
  print(f"chrom : {chrom}, chrom_start : {chrom_start}, chrom_end : {chrom_end}, gene_name: {gene_name}, length : {dic[record]}")


# Third option:
# a dictionary whose key is the record tuple and whose value is the length.
dic = {}
with open('c:/users/xiaoman/downloads/bedExample2.bed', 'r') as f_in:
  for i, line in enumerate(f_in):

    # Demo only: show the raw tab-separated fields of the first line.
    if i == 0:
      print(line.split('\t'))

    # Skip blank lines (e.g. a trailing empty line); they would otherwise
    # fail the four-field unpacking below.
    if not line.strip():
      continue

    content = line.strip('\n').split('\t')
    chrom, chrom_start, chrom_end, gene_name = content[:4]
    # The coordinates are read as text; convert them to integers.
    chrom_start, chrom_end = int(chrom_start), int(chrom_end)
    # Per the BED format specification, intervals are 0-based and half-open
    # (chrom_end is exclusive), so the length is end - start with no "+ 1".
    length = chrom_end - chrom_start

    dic[(chrom, chrom_start, chrom_end, gene_name)] = length

# Sort the records by length, breaking ties by the record tuple itself.
# Iterating a dict yields its keys, so no intermediate list is needed.
data = sorted(dic, key=lambda x: (dic[x], x))

  
# The fourth choice: store only the parsed fields and let the sort key
# derive the length (chrom_end - chrom_start) on the fly.
data = []
with open('bedExample2.bed') as f_in:
  for i, line in enumerate(f_in):
    content = line.strip('\n').split('\t')

    # Demo only: show the first line raw, split as-is, and split after
    # removing the trailing newline.
    if i == 0:
      print('orginal', line)
      print('Before strip', line.split('\t'))
      print('After strip', content)

    chrom, chrom_start, chrom_end, gene_name = content[:4]
    # The coordinates arrive as text; convert both to integers.
    chrom_start, chrom_end = map(int, (chrom_start, chrom_end))
    data.append([chrom, chrom_start, chrom_end, gene_name])

# Order rows by length first, then by the row itself to break ties.
data.sort(key=lambda row: (row[2] - row[1], row))

