采用 kofam 的注释结果，结合 KEGG 官网下载的 htext（.keg）层级文件进行通路注释
用法:
python kofam2kegg.py kofam.out ath00001.keg my_kegg_output
code:
import sys
from collections import defaultdict


def parse_kofam_file(kofam_file):
    """Read a two-column gene-to-KO table and group the genes by KO.

    Each line is stripped and split on tabs. Only lines that yield exactly
    two fields (``gene<TAB>ko``) are used; every other line (for example a
    gene with no KO assignment) is ignored.

    Args:
        kofam_file: Path of the tab-separated gene/KO file.

    Returns:
        A ``defaultdict(list)`` mapping each KO identifier to the list of
        genes assigned to it, in file order.
    """
    ko_to_genes = defaultdict(list)
    with open(kofam_file) as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                gene, ko = parts
                ko_to_genes[ko].append(gene)
    return ko_to_genes


def parse_keg_file(keg_file):
    """Parse a hierarchy (``.keg``) file into KO-to-pathway mappings.

    Lines are classified by their first character after stripping:

    * ``A`` - level-1 category; the rest of the line is kept as its label.
    * ``B`` - level-2 category; the rest of the line is kept as its label.
    * ``C`` - pathway entry. The second whitespace-separated token is the
      pathway code and is used as the pathway ID. A trailing bracketed tag
      such as ``[PATH:ath00010]`` is not part of the pathway name.
    * ``D`` - gene entry. Only lines with exactly one tab are used; the
      first token after the tab is taken as the KO identifier.

    Args:
        keg_file: Path of the hierarchy file.

    Returns:
        A tuple ``(ko_to_pathway, pathway_info)`` where ``ko_to_pathway`` is
        a ``defaultdict(list)`` mapping a KO to the pathway IDs it occurs in
        (one entry per matching ``D`` line, so duplicates are possible), and
        ``pathway_info`` maps a pathway ID to a dict with the keys
        ``'Pathway'``, ``'Level1'`` and ``'Level2'``.
    """
    ko_to_pathway = defaultdict(list)
    pathway_info = {}
    level1 = level2 = ''
    pathway_id = ''
    with open(keg_file) as f:
        for line in f:
            line = line.strip()
            if line.startswith('A'):
                level1 = line[1:].strip()
            elif line.startswith('B'):
                level2 = line[1:].strip()
            elif line.startswith('C'):
                parts = line.split()
                if len(parts) < 2:
                    # A bare "C" carries no pathway code; make sure the D
                    # lines that follow are not credited to the previous
                    # pathway.
                    pathway_id = ''
                    continue
                # Key on the numeric code rather than on the bracket tag:
                # the tag keeps its organism prefix and closing "]", and it
                # is missing altogether on some C lines.
                pathway_id = parts[1]
                last = parts[-1]
                has_tag = (
                    len(parts) > 2
                    and last.startswith('[')
                    and last.endswith(']')
                )
                # Drop the last token from the name only when it really is
                # a "[...]" tag, so untagged names keep their final word.
                name_tokens = parts[1:-1] if has_tag else parts[1:]
                pathway_info[pathway_id] = {
                    'Pathway': ' '.join(name_tokens),
                    'Level1': level1,
                    'Level2': level2,
                }
            elif line.startswith('D'):
                parts = line.split('\t')
                # Skip D lines that have no enclosing pathway; they would
                # otherwise be collected under an empty pathway ID.
                if len(parts) == 2 and pathway_id:
                    ko = parts[1].split()[0]
                    ko_to_pathway[ko].append(pathway_id)
    return ko_to_pathway, pathway_info


def main(kofam_file, keg_file, output_file):
    """Write a per-pathway summary table of annotated genes.

    For every KO present in both input files, the KO and its genes are added
    to each pathway the KO belongs to. One tab-separated row is written per
    pathway: pathway name, number of distinct genes, pathway ID prefixed
    with ``ko``, level-1 label, level-2 label, sorted ``;``-joined KOs and
    sorted ``;``-joined genes. Pathways without any matching gene are not
    written.

    Args:
        kofam_file: Path of the gene/KO table read by ``parse_kofam_file``.
        keg_file: Path of the hierarchy file read by ``parse_keg_file``.
        output_file: Path of the table to create (overwritten if present).
    """
    ko_to_genes = parse_kofam_file(kofam_file)
    ko_to_pathway, pathway_info = parse_keg_file(keg_file)

    # Sets collapse the duplicates that arise when several D lines of one
    # pathway carry the same KO.
    pathway_dict = defaultdict(lambda: {'genes': set(), 'kos': set()})
    for ko, genes in ko_to_genes.items():
        # .get() avoids inserting empty entries into the defaultdict.
        for pw_id in ko_to_pathway.get(ko, ()):
            pathway_dict[pw_id]['genes'].update(genes)
            pathway_dict[pw_id]['kos'].add(ko)

    with open(output_file, 'w') as out:
        out.write("Pathway\tGeneCount\tPathway ID\tLevel 1\tLevel 2\tKOs\tGenes\n")
        for pw_id, data in pathway_dict.items():
            info = pathway_info.get(pw_id, {})
            out.write(
                f"{info.get('Pathway', '')}\t{len(data['genes'])}\tko{pw_id}\t"
                f"{info.get('Level1', '')}\t{info.get('Level2', '')}\t"
                f"{';'.join(sorted(data['kos']))}\t{';'.join(sorted(data['genes']))}\n"
            )


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("用法: python 1.py kofam.out keg_file output_file")
    else:
        main(sys.argv[1], sys.argv[2], sys.argv[3])