# -*- coding: utf-8 -*-
"""python_spider -- download e-books linked from a TesterHome topic page.

Blog post published 2016-03-21 (category: python); it first appeared under
the title "爬了几本书" ("Scraped a few books").

The author wrote a small script for fun to scrape a few books.  After posting
it, there were some suggestions for improvement: use multiple threads, use a
class, and so on.  Both listings from the post are kept in this module:

* "Before" -- ``getHtml``, ``getlink`` and ``main_before``.  The original
  listing ran its interactive part at module level; it is wrapped in
  ``main_before`` here so that both listings can live in one importable file.
* "After"  -- the ``get_book`` class and ``main``.

The post closes by suggesting to run it and look at the result.

The code runs on Python 2 and Python 3.  ``print`` is always called with a
single pre-formatted string so the output is identical on both.
"""
__author__ = 'anderson'

import os
import re
import subprocess
import threading
import time
import urllib
from contextlib import closing
from time import sleep, ctime

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

# Topic page that lists the book download links.
TOPIC_URL = "https://testerhome.com/topics/4419"

# Menu number -> label text that identifies a download format on the page.
TYPES = {1: "PDF", 2: "EPUB", 3: "MOBI"}

# Menu text printed by the "before" script.
MENU = """ 1.pdf 2.epub 3.mobi"""

# Label of the "read online" link, which is not a downloadable file.
ONLINE_READING = r"在线阅读"

# Captures (href, label) for every list-item link.  Compiled once instead of
# twice per call as in the original listings.
# NOTE(review): whitespace in the scraped listing was collapsed, so the single
# space between "<li>" and "<a" is reproduced as found -- confirm against the
# live page markup.
LINK_RE = re.compile('<li> <a href="(.*?)" target="_blank">(.*?)</a>', re.S)


def fetch_page(url):
    """Return the body of *url* as the native ``str`` type.

    The connection is closed as soon as the body has been read.  On Python 3
    the bytes are decoded so the text can be searched with ``LINK_RE``; on
    Python 2 the byte string is returned unchanged, as the original did.
    """
    with closing(urlopen(url)) as page:
        raw = page.read()
    if not isinstance(raw, str):
        # NOTE(review): assumes the page is UTF-8 encoded -- confirm.
        raw = raw.decode("utf-8", "replace")
    return raw


def find_links(html):
    """Return a list of ``(href, label)`` pairs for the links in *html*."""
    return LINK_RE.findall(html)


def resolve_type(choice):
    """Map the user's menu answer to a format label.

    Returns ``"PDF"``, ``"EPUB"`` or ``"MOBI"`` for ``"1"``, ``"2"``, ``"3"``
    and ``None`` for anything else, including empty or non-numeric input.
    """
    try:
        return TYPES.get(int(choice))
    except (TypeError, ValueError):
        return None


def download(url):
    """Download *url* into the current directory with ``curl -O``.

    The URL comes from scraped HTML, so it is passed as a separate argument
    rather than interpolated into a shell command line.  Returns curl's exit
    status, or ``None`` when curl could not be started.
    """
    try:
        return subprocess.call(["curl", "-O", url])
    except OSError as exc:
        print("could not run curl for %s: %s" % (url, exc))
        return None


# ---------------------------------------------------------------------------
# Before the improvements
# ---------------------------------------------------------------------------

def getHtml(url):
    """Fetch *url* and return the page source."""
    return fetch_page(url)


def getlink(html, type):
    """Print every link found in *html* and download the matching ones.

    A link is downloaded when its label contains *type* and it is not the
    "read online" link.  Returns the list of URLs passed to ``download``.
    """
    fetched = []
    for href, label in find_links(html):
        print("%s %s" % (href, label))
        if label != ONLINE_READING and type in label:
            download(href)
            fetched.append(href)
    return fetched


def main_before():
    """Interactive entry point of the "before" listing.

    Unlike the original, an empty or unknown answer stops here: the original
    went on to call ``getlink`` with an empty type, which matches every label
    and therefore downloaded every link on the page.
    """
    print(MENU)
    yourtype = read_line("please input your type: ")
    if yourtype == '':
        print("please choose one type!")
        return
    downtype = resolve_type(yourtype)
    if downtype is None:
        print("please check your choice!")
        return
    getlink(getHtml(TOPIC_URL), downtype)


# ---------------------------------------------------------------------------
# After the improvements
# ---------------------------------------------------------------------------

class get_book(object):
    """Downloader for one format of the books linked from a page.

    Attributes are set directly from the constructor arguments:
    ``nsec`` (seconds to pause after each download in ``getlink``),
    ``url`` (default page to fetch) and ``types`` (default format label).
    """

    def __init__(self, nsec, url, types):
        self.types = types
        self.nsec = nsec
        self.url = url

    def now(self):
        """Return the local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

    def getHtml(self, url=None):
        """Fetch *url* (``self.url`` when omitted) and return the page source.

        The original ignored its argument and always fetched ``self.url``.
        """
        return fetch_page(url or self.url)

    def getlink(self, html, types=None):
        """Print every link in *html* and download those matching *types*.

        *types* defaults to ``self.types``; the original ignored the argument.
        Downloads run one after another with a pause of ``self.nsec`` seconds
        after each.  Returns the list of URLs passed to ``download``.
        """
        wanted = self.types if types is None else types
        fetched = []
        for href, label in find_links(html):
            print("%s %s" % (href, label))
            if wanted in label:
                download(href)
                # NOTE(review): indentation was lost in the scraped listing;
                # the pause is applied after each download.
                sleep(self.nsec)
                fetched.append(href)
        return fetched


def main():
    """Ask for a format, then download every matching book concurrently."""
    print(sorted(TYPES.items()))
    yourtype = read_line("please input your type: ")
    wanted = resolve_type(yourtype)
    if wanted is None:
        # The original tested ``range(1, 3)``, which rejected choice 3 (MOBI),
        # and raised ValueError on non-numeric input.
        print("please select again")
        return
    print(wanted)

    get = get_book(2, TOPIC_URL, wanted)
    html = get.getHtml(TOPIC_URL)
    print('starting at: %s' % get.now())

    links = []
    for href, label in find_links(html):
        print("%s %s" % (href, label))
        if wanted in label:
            links.append(href)

    # One thread per file.  The original started ten threads that each ran
    # the whole link loop, so every file was fetched ten times into the same
    # output name.
    threadpool = [threading.Thread(target=download, args=(href,))
                  for href in links]
    for th in threadpool:
        th.start()
    for th in threadpool:
        th.join()
    print('all Done at: %s' % get.now())


if __name__ == '__main__':
    main()