My code is meant to transform a text file in the format below:
> gene name
gene sequence
to a CSV file in which each row has the gene name in one column and the gene sequence in the other. However, some of the gene sequences flow over into the next row.
How can I prevent this from happening?
def parse_gene_file(input_file):
    """Parse a text file of ``>name`` / sequence records into (name, sequence) pairs.

    A line that starts with ``>`` (after surrounding whitespace is stripped)
    opens a new record; the rest of that line, stripped, is the gene name.
    Every other line is stripped and concatenated onto the sequence of the
    record currently open.  Lines that appear before the first ``>`` line are
    discarded.  Each completed record is also reported with ``print``.

    Args:
        input_file: Path of the text file to read.

    Returns:
        A list of ``(gene_name, gene_sequence)`` tuples in file order.
    """
    records = []
    current_name = None
    fragments = []

    def emit():
        # Close the open record, if there is one: store it and report it.
        if current_name is None:
            return
        sequence = ''.join(fragments)
        records.append((current_name, sequence))
        print(f"Added gene: {current_name} with sequence: {sequence}")

    with open(input_file, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith('>'):
                # Sequence data (or a blank line): accumulate it.
                fragments.append(text)
                continue
            # A header line ends the previous record and starts a new one.
            emit()
            current_name = text[1:].strip()
            fragments = []

    # The final record has no following header to trigger its emission.
    emit()
    return records
def write_to_csv(genes, output_file):
    """Write gene records to a two-column CSV file with a header row.

    The first row written is ``Gene Name,Gene Sequence``; each item of
    ``genes`` then becomes one CSV record.

    Args:
        genes: Iterable of ``(gene_name, gene_sequence)`` rows.
        output_file: Path of the CSV file to create or overwrite.
    """
    # Imported locally: this function needs the csv module, and the file has
    # no module-level import that provides it.
    import csv

    # NOTE(review): if records look split across rows when the result is
    # opened in a spreadsheet, check that application's per-cell character
    # limit before suspecting this writer -- TODO confirm against real data.

    # newline='' leaves line-ending handling to the csv writer.
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Gene Name', 'Gene Sequence'])
        writer.writerows(genes)
def main(input_file, output_file):
    """Convert a gene text file to CSV.

    Parses ``input_file`` with ``parse_gene_file`` and hands the resulting
    records to ``write_to_csv`` for writing to ``output_file``.

    Args:
        input_file: Path of the gene text file to read.
        output_file: Path of the CSV file to write.
    """
    write_to_csv(parse_gene_file(input_file), output_file)
if __name__ == "__main__":
    # Script entry point: convert the input text file (first path) into the
    # output CSV file (second path).
    input_file, output_file = r'C:txt', r'C:csv'
    main(input_file, output_file)
You can't. CSV files don't store column-width information; they are plain text files with no formatting. In Excel, double-click the border between the column A and B headers, and then the border between B and C, to auto-fit each column's width to its content.
As an alternative, use a package that supports XLSX output and specify the column width there.