python_spider

此文发表在：爬了几本书

之前自娱自乐，写了个小爬虫，爬了几本书。
发帖之后，收到了一些改进意见，
比如使用多线程、用类来封装等等。

改进前：

__author__ = 'anderson'


#coding=utf-8
import urllib
import re
import os

def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        The body exactly as returned by the response's ``read()``; no
        decoding is performed here.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises.
        page.close()

def getlink(html,type):
    """Print every book link found in ``html`` and download the matching ones.

    A link is downloaded with ``curl -O`` when its label is not the
    "read online" entry and contains ``type`` as a substring.

    Args:
        html: Page source to scan for ``<li>  <a href=... target="_blank">``
            entries.
        type: Substring the link label must contain (e.g. ``'PDF'``). The
            name shadows the builtin but is kept so existing callers that
            pass it by keyword keep working.

    Raises:
        OSError: If the ``curl`` executable cannot be started.
    """
    # Imported locally because the file's import header does not provide it.
    import subprocess

    # NOTE(review): the label literal below is non-ASCII. Python 2 only
    # honours a coding declaration on the first or second line of a file, so
    # confirm "#coding=utf-8" is placed there when this is saved as a script.
    pattern = re.compile('<li>  <a href="(.*?)" target="_blank">(.*?)</a>',re.S)
    for link, label in pattern.findall(html):
        print("%s %s" % (link, label))
        if label != r"在线阅读" and type in label:
            # Pass the URL as a single argv entry: scraped hrefs are untrusted
            # and often contain '&' or ';', which a shell would interpret.
            subprocess.call(["curl", "-O", link])

# Menu shown to the user; the number typed selects the format to download.
# (The name shadows the builtin ``type``; it is kept because it is a
# module-level name of this script.)
type = """
    1.pdf
    2.epub
    3.mobi"""

print(type)

html = getHtml("https://testerhome.com/topics/4419")


yourtype = raw_input("please input your type: ")

# Menu number -> text that must appear in the link label.
_choices = {'1': 'PDF', '2': 'EPUB', '3': 'MOBI'}
downtype = _choices.get(yourtype, "")

if yourtype == '':
    print("please choose one type!")
elif not downtype:
    print("please check your choice!")
else:
    # Only download for a valid choice. An empty ``downtype`` is a substring
    # of every label, so calling getlink() with it would fetch every file.
    getlink(html,downtype)

改进后：

__author__ = 'anderson'


#coding=utf-8
import urllib
import re
import os
import threading,time
from time import sleep, ctime

class get_book():
    """Download the book files of one format that are linked from a page.

    Attributes are set from the constructor arguments:
        types: Substring a link label must contain to be downloaded.
        nsec:  Seconds to sleep after each download.
        url:   Default page address used by ``getHtml``.
    """

    def __init__(self,nsec, url, types) :
        self.types = types
        self.nsec = nsec
        self.url = url

    def now(self) :
        """Return the current local time as ``'YYYY-MM-DD HH:MM:SS'``."""
        return str( time.strftime( '%Y-%m-%d %H:%M:%S' , time.localtime() ) )

    def getHtml(self,url):
        """Fetch a page and return its raw body.

        Args:
            url: Page to fetch. Falls back to ``self.url`` when empty; the
                argument used to be ignored entirely.

        Returns:
            The body exactly as returned by the response's ``read()``.
        """
        page = urllib.urlopen(url or self.url)
        try:
            return page.read()
        finally:
            # Release the connection even when read() raises.
            page.close()

    def getlink(self,html,types):
        """Print every link found in ``html`` and download the matching ones.

        Args:
            html: Page source to scan.
            types: Substring the link label must contain. Falls back to
                ``self.types`` when empty; the argument used to be ignored.

        Raises:
            OSError: If the ``curl`` executable cannot be started.
        """
        # Imported locally because the file's import header does not provide it.
        import subprocess

        wanted = types or self.types
        pattern = re.compile('<li>  <a href="(.*?)" target="_blank">(.*?)</a>',re.S)
        for link, label in pattern.findall(html):
            print("%s %s" % (link, label))
            if wanted in label:
                # Pass the URL as a single argv entry: scraped hrefs are
                # untrusted and may contain shell metacharacters such as '&'.
                subprocess.call(["curl", "-O", link])
                # Pause between downloads.
                sleep(self.nsec)


def main():
    """Ask for a format, then download every matching book from the page.

    Reads the menu choice from standard input. Anything that is not one of
    the listed numbers prints "please select again" and returns without
    downloading.
    """
    types = {1:"PDF",2:"EPUB",3:"MOBI"}

    print(types.items())

    page_url = "https://testerhome.com/topics/4419"

    yourtype = raw_input("please input your type: ")

    # Non-numeric input is treated like any other invalid choice instead of
    # raising ValueError.
    try:
        choice = int(yourtype)
    except ValueError:
        choice = None

    # Membership in the dict accepts 1, 2 and 3; the previous range(1, 3)
    # check rejected 3, so MOBI could never be selected.
    if choice not in types:
        print("please select again")
        return

    wanted = types[choice]
    print(wanted)

    get = get_book(2, page_url, wanted)
    html = get.getHtml(page_url)

    print('starting at: %s' % get.now())

    # getlink() walks every link on the page by itself. Running it in ten
    # threads, as before, fetched each file ten times concurrently into the
    # same output name, so it is run once. Real parallelism would need one
    # task per link.
    get.getlink(html, wanted)

    print('all Done at: %s' % get.now())


if __name__ == '__main__':
    main()

然后看看效果。