# Script that reads every PDB file name from the pdbs list file and downloads each one into a folder.
#!/usr/bin/python
# coding=utf8
"""Download every PDB structure listed in ``appdrugs_pdbs.csv`` into ``pdbs/``.

The list file is read line by line (one PDB id per line), a fixed-size pool
of worker threads is created, and each id is fetched over HTTP unless the
target file already exists locally.
"""
import contextlib
import os
import shutil
import string  # no longer used by this script; kept in case other code relies on it
import sys
import threading
import time
import traceback

try:  # Python 2 module names
    import Queue
    import urllib2
except ImportError:  # Python 3: same APIs under their new names
    import queue as Queue
    import urllib.request as urllib2


# Multithreading helpers
#########################################################################
class Worker(threading.Thread):
    """Daemon thread that runs jobs from a shared queue until it is empty.

    Each queue item is a ``(callable, args, kwds)`` tuple.  The return value
    of every successful job is put on ``resultQueue``.
    """

    def __init__(self, workQueue, resultQueue, **kwds):
        threading.Thread.__init__(self, **kwds)
        self.daemon = True
        self.workQueue = workQueue
        self.resultQueue = resultQueue

    def run(self):
        """Process jobs until the work queue reports empty, then exit."""
        while True:
            try:
                # Non-blocking get: an empty queue means there is no work left.
                job, args, kwds = self.workQueue.get(False)
            except Queue.Empty:
                break
            try:
                res = job(*args, **kwds)
            except Exception:
                # One failed job must not kill the worker, otherwise the
                # jobs still queued behind it may never be processed.
                traceback.print_exc()
                continue
            self.resultQueue.put(res)


class WorkManager(object):
    """Thread pool: owns the job queue, the result queue and the workers."""

    def __init__(self, num_of_workers=10):
        self.workQueue = Queue.Queue()    # pending jobs
        self.resultQueue = Queue.Queue()  # return values of finished jobs
        self.workers = []
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        """Create (but do not start) ``num_of_workers`` worker threads."""
        for _ in range(num_of_workers):
            self.workers.append(Worker(self.workQueue, self.resultQueue))

    def start(self):
        """Start every worker.  Jobs must already be queued at this point,
        because a worker exits as soon as it finds the queue empty."""
        for w in self.workers:
            w.start()

    def wait_for_complete(self):
        """Block until every worker has finished; empties ``self.workers``."""
        # join() without a timeout only returns once the thread has ended,
        # so no liveness re-check is needed afterwards.
        while self.workers:
            self.workers.pop().join()
        print('All jobs were complete.')

    def add_job(self, callable, *args, **kwds):
        """Queue ``callable(*args, **kwds)`` for execution by a worker."""
        self.workQueue.put((callable, args, kwds))

    def get_result(self, *args, **kwds):
        """Return the next job result; arguments are passed to ``Queue.get``."""
        return self.resultQueue.get(*args, **kwds)
##############################################################################
# End of multithreading helpers


# Path of the previously written list of PDB ids.
appdrugs_db_pdbs_path = 'appdrugs_pdbs.csv'

# Number of threads, i.e. simultaneous downloads (tune to the machine).
num_of_threads = 10


def read_pdb_ids(path):
    """Return the non-blank lines of ``path`` with surrounding whitespace
    (including any ``\\r``) removed."""
    with open(path) as handle:
        stripped = (line.strip() for line in handle)
        return [pdb_id for pdb_id in stripped if pdb_id]


def download_file(pdb_id, apl):
    """Download one PDB file into ``pdbs/`` unless it is already there.

    pdb_id -- id used in the URL as given and upper-cased for the file name.
    apl    -- position of this job in the submission order; only echoed in
              the progress message.

    Raises whatever ``urlopen`` or the file operations raise; a partially
    written file is removed first so a later run retries the download.
    """
    # The IP address is used instead of files.rcsb.org to save DNS lookups.
    # NOTE(review): a hard-coded IP can go stale -- confirm it still serves
    # files.rcsb.org.
    url = 'http://132.249.213.140:80/view/'+pdb_id+'.pdb'
    pathname = 'pdbs/'+pdb_id.upper()+'.pdb'
    if os.path.exists(pathname):
        print(pdb_id+" existed, "+str(apl)+" jobs done")
        return
    print(pdb_id+" downloading, "+str(apl)+" jobs done")
    # Write to a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete file on the next run.
    partial = pathname + '.part'
    try:
        with contextlib.closing(urllib2.urlopen(url)) as response:
            with open(partial, "wb") as out:
                shutil.copyfileobj(response, out)
        os.rename(partial, pathname)
    except Exception:
        if os.path.exists(partial):
            os.remove(partial)
        raise


def main():
    """Run the whole download; return the process exit status."""
    # Without the list file there is nothing to do.
    if not os.path.exists(appdrugs_db_pdbs_path):
        print(appdrugs_db_pdbs_path+" not existed")
        return 1

    appdrugs_pdbs_list = read_pdb_ids(appdrugs_db_pdbs_path)
    print("+ appdrugs_pdbs_list readed")

    # Downloads go into the pdbs folder, so create it if needed.
    if not os.path.exists('pdbs'):
        os.mkdir('pdbs')

    _st = time.time()
    wm = WorkManager(num_of_threads)
    for apl, pdb_id in enumerate(appdrugs_pdbs_list):
        wm.add_job(download_file, pdb_id, apl)
    wm.start()
    wm.wait_for_complete()
    print("total time:")
    print(time.time() - _st)
    return 0


if __name__ == '__main__':
    sys.exit(main())