I'm working on a Python script to match the annotated GO IDs from a file, and than extract Level 2 Gene Ontology (GO) IDs from a hierarchy. However, I'm facing issues with extracting the correct Level 2 GO IDs and displaying the corresponding information. Here's what I have so far: I have used go-basic.obo file to classify terms for the annotated GO IDs and their frequencies (total number of go_id) in a file called file.txt. I Expected Output in excell file with following column
The GO ID itself. The corresponding term name from the GO hierarchy. The Level 2 GO ID in the hierarchy. The frequency of the GO ID from file.txt.
I tried with this code, but its not properly extracting go id at level 2 of corroespong annotated go id, here is the code import pandas as pd
def parse_obo(file_path):
terms = {}
current_term = {}
isterm = False
with open(file_path, 'r') as f:
for line in f:
line = line.strip()
isterm = isterm or line.startswith('id:')
if isterm and ": " in line:
key, value = line.split(': ', 1)
if key == "id":
current_term = terms.setdefault(value, {})
current_term["id"] = value
else:
current_term.setdefault(key, []).append(value)
return terms
def make_hierarchy(terms):
for term in list(terms.values()):
term.setdefault("children", [])
if "is_a" in term:
for is_a in term["is_a"]:
parent = is_a.split()[0]
terms.setdefault(parent, { 'id': parent }).setdefault("children", []).append(term)
return [term for term in terms.values() if "is_a" not in term]
def calculate_go_frequency(file_path):
go_frequency = {}
with open(file_path, 'r') as f:
for line in f:
_, go_id = line.strip().split('\t')
go_frequency[go_id] = go_frequency.get(go_id, 0) + 1
return go_frequency
def find_level_2_go_id(go_id, terms, hierarchy):
for term in hierarchy:
if term["id"] == go_id:
return None
level_2_id = None
for child in term.get("children", []):
if child["id"] == go_id:
level_2_id = term["id"]
break
for sub_child in child.get("children", []):
if sub_child["id"] == go_id:
level_2_id = child["id"]
break
if level_2_id:
break
if level_2_id:
return level_2_id
else:
level_2_id = find_level_2_go_id(go_id, terms, term.get("children", []))
if level_2_id:
return level_2_id
return None
if __name__ == "__main__":
file_path = 'go-basic.obo'
terms = parse_obo(file_path)
roots = make_hierarchy(terms)
# Read and process the file.txt file
unique_file_path = 'file.txt'
unique_go_frequency = calculate_go_frequency(unique_file_path)
# Create a list of dictionaries for the data
data = []
for go_id, frequency in unique_go_frequency.items():
term = terms.get(go_id, {})
term_name = term.get("name", "unknown_term")
# Get the Level 2 GO ID based on the hierarchy
level_2_id = find_level_2_go_id(go_id, terms, roots)
data.append({
"GO ID": go_id,
"Term": term_name,
"Level 2 GO ID": level_2_id,
"Frequency": frequency
})
# Create a DataFrame from the data
df = pd.DataFrame(data)
# Save the DataFrame to an Excel file
output_excel = 'gene_counts_with_levels.xlsx'
df.to_excel(output_excel, index=False)
print(f"Data saved to {output_excel}")
kindly help this question is also following the previous question Retrieving Gene Ontology Hierarchy Using Python from this example file, level will be classify like this, given below
level1 GO:0048311
level2 GO:0000001
level1 GO:0022411
level2 GO:1903008
level3 GO:0090166
level1 GO:0016043
level2 GO:0006996
level3 GO:0048308
level4 GO:0000001
level4 GO:0048309
level4 GO:0048313
level3 GO:0007029
level4 GO:0048309
level3 GO:0007030
level4 GO:0048313
level4 GO:0090166
level3 GO:1903008
level4 GO:0090166
this is my file
id GO_ID
OGOAOAJO_00443 GO:0006996
OGOAOAJO_00021 GO:0048313
OGOAOAJO_00021 GO:0048308
OGOAOAJO_00830 GO:0048308
OGOAOAJO_00830 GO:0048308
OGOAOAJO_02897 GO:0000001
OGOAOAJO_02897 GO:0000001
OGOAOAJO_00517 GO:0000001
OGOAOAJO_01362 GO:0000001
OGOAOAJO_02172 GO:0048309
OGOAOAJO_02172 GO:0048313
OGOAOAJO_03684 GO:0007029
OGOAOAJO_03684 GO:0000166
OGOAOAJO_03684 GO:0048309
OGOAOAJO_03684 GO:0007030
OGOAOAJO_01125 GO:0048313
OGOAOAJO_01125 GO:0048313
OGOAOAJO_00657 GO:0090166
OGOAOAJO_00223 GO:0090166
OGOAOAJO_00223 GO:0090166
OGOAOAJO_03140 GO:1903008
OGOAOAJO_03140 GO:0090166
OGOAOAJO_03647 GO:0090166
OGOAOAJO_03407 GO:0090166
OGOAOAJO_00603 GO:0048311
OGOAOAJO_00603 GO:0048311
OGOAOAJO_01401 GO:0000001
OGOAOAJO_00603 GO:0000001
my output is like thsi
GO ID Term Level 2 GO ID Frequency
GO:0006996 ['organelle organization'] GO:0016043 1
GO:0048313 ['Golgi inheritance'] GO:0048308 4
GO:0048308 ['organelle inheritance'] GO:0006996 3
GO:0000001 ['mitochondrion inheritance'] GO:0048311 6
GO:0048309 ['endoplasmic reticulum inheritance'] GO:0048308 2
GO:0007029 ['endoplasmic reticulum organization'] GO:0006996 1
GO:0007030 ['Golgi organization'] GO:0006996 1
GO:0090166 ['Golgi disassembly'] GO:1903008 6
GO:1903008 ['organelle disassembly'] GO:0022411 1
GO:0048311 unknown_term 2
but problm is that in Level 2 GO ID go:id is not coming as aspected in level 2, as it can be seen that first row Level 2 GO ID should be GO:0006996 as a level2 but it is GO:0016043 which is level1,
correct output would be
GO ID Term Level 2 GO ID Frequency
GO:0006996 ['organelle organization'] GO:0006996 1
GO:0048313 ['Golgi inheritance'] GO:0006996 4
GO:0048308 ['organelle inheritance'] GO:0006996 3
GO:0000001 ['mitochondrion inheritance'] GO:0000001 6
GO:0000001 ['mitochondrion inheritance'] GO:0006996 6
GO:0048309 ['endoplasmic reticulum inheritance'] GO:0006996 2
GO:0007029 ['endoplasmic reticulum organization'] GO:0006996 1
GO:0007030 ['Golgi organization'] GO:0006996 1
GO:0090166 ['Golgi disassembly'] GO:1903008 6
GO:0090166 ['Golgi disassembly'] GO:0006996 6
GO:1903008 ['organelle disassembly'] GO:1903008 1
GO:1903008 ['organelle disassembly'] GO:0006996 1
GO:0048311 unknown_term 2
This piece of code:
for child in term.get("children", []):
if child["id"] == go_id:
level_2_id = term["id"]
break
...may also set level2_id
when term
is a top-level term.
I would suggest a different approach and write a function that returns the possible root-to-node paths to the given id, i.e. where each path is a list of IDs where the last one in the list is the searched id. This is a more generic function and can be used to find an ID at any level in the hierarchy.
So replace find_level_2_go_id
with this function:
def find_paths_go_id(go_id, terms):
def get_ancestors(id, path):
if id not in terms or "is_a" not in terms[id]:
yield path
return
for parent in terms[id]["is_a"]:
yield from get_ancestors(parent, [parent, *path])
return get_ancestors(go_id, [go_id])
Add another function to extract a certain level from such paths:
def get_distinct_at_depth(paths, depth):
return { path[depth] if depth < len(path) else None for path in paths }
And call it like this:
for go_id, frequency in unique_go_frequency.items():
if not go_id.startswith("GO:"):
continue
term = terms.get(go_id, {})
term_name = term.get("name", "unknown_term")
# Get the Level 2 GO ID based on the hierarchy
level_2_nodes = get_distinct_at_depth(find_paths_go_id(go_id, terms), 1)
for level_2_id in level_2_nodes:
data.append({
"GO ID": go_id,
"Term": term_name,
"Level 2 GO ID": level_2_id,
"Frequency": frequency
})
In the existing parse_obo
make sure to skip the part after "!" (like in the "is_a" properties):
else:
# skip the part after "!"
current_term.setdefault(key, []).append(value.split(" !")[0])
...result in this code:
import pandas as pd
def parse_obo(file_path):
terms = {}
current_term = {}
isterm = False
with open(file_path, 'r') as f:
for line in f:
line = line.strip()
isterm = isterm or line.startswith('id:')
if isterm and ": " in line:
key, value = line.split(': ', 1)
if key == "id":
current_term = terms.setdefault(value, {})
current_term["id"] = value.split()[0] # only get id
else:
# skip the part after "!"
current_term.setdefault(key, []).append(value.split(" !")[0])
return terms
def make_hierarchy(terms): # not used
for term in list(terms.values()):
term.setdefault("children", [])
if "is_a" in term:
for is_a in term["is_a"]:
parent = is_a.split()[0]
terms.setdefault(parent, { 'id': parent }).setdefault("children", []).append(term)
return [term for term in terms.values() if "is_a" not in term]
def calculate_go_frequency(file_path):
go_frequency = {}
with open(file_path, 'r') as f:
for line in f:
_, go_id = line.strip().split('\t')
go_frequency[go_id] = go_frequency.get(go_id, 0) + 1
return go_frequency
def find_paths_go_id(go_id, terms):
def get_ancestors(id, path):
if id not in terms or "is_a" not in terms[id]:
yield path
return
for parent in terms[id]["is_a"]:
yield from get_ancestors(parent, [parent, *path])
return get_ancestors(go_id, [go_id])
def get_distinct_at_depth(paths, depth):
return { path[depth] if depth < len(path) else None for path in paths }
if __name__ == "__main__":
file_path = 'go-basic.obo'
terms = parse_obo(file_path)
# Hierarchy is not needed?
# roots = make_hierarchy(terms)
# Read and process the unique.txt file
unique_file_path = 'unique.txt'
unique_go_frequency = calculate_go_frequency(unique_file_path)
# Create a list of dictionaries for the data
data = []
for go_id, frequency in unique_go_frequency.items():
if not go_id.startswith("GO:"):
continue
term = terms.get(go_id, {})
term_name = term.get("name", "unknown_term")
# Get the Level 2 GO ID based on the hierarchy
level_2_nodes = get_distinct_at_depth(find_paths_go_id(go_id, terms), 1)
for level_2_id in level_2_nodes:
data.append({
"GO ID": go_id,
"Term": term_name,
"Level 2 GO ID": level_2_id,
"Frequency": frequency
})
# Create a DataFrame from the data
df = pd.DataFrame(data)
pd.set_option('display.max_columns', 7)
print(df)
Running this program df
prints out as follows:
GO ID Term Level 2 GO ID Frequency
0 GO:0006996 [organelle organization] GO:0006996 1
1 GO:0048313 [Golgi inheritance] GO:0006996 4
2 GO:0048308 [organelle inheritance] GO:0006996 3
3 GO:0000001 [mitochondrion inheritance] GO:0006996 6
4 GO:0000001 [mitochondrion inheritance] GO:0000001 6
5 GO:0048309 [endoplasmic reticulum inheritance] GO:0006996 2
6 GO:0007029 [endoplasmic reticulum organization] GO:0006996 1
7 GO:0000166 unknown_term None 1
8 GO:0007030 [Golgi organization] GO:0006996 1
9 GO:0090166 [Golgi disassembly] GO:1903008 6
10 GO:0090166 [Golgi disassembly] GO:0006996 6
11 GO:1903008 [organelle disassembly] GO:1903008 1
12 GO:1903008 [organelle disassembly] GO:0006996 1
13 GO:0048311 unknown_term None 2
Note that for this you don't need the make_hierarchy
function, but maybe you need it for other purposes.