Snake's Home

python_spider

此文发表在: 爬了几本书

之前自娱自乐,写了个爬虫,爬了几本书。
发帖之后,收到了一些改进意见:
比如用多线程、用类来实现等等。

改进前:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#coding=utf-8
__author__ = 'anderson'

import os
import re
import subprocess
import urllib

def getHtml(url):
    """Download *url* and return the raw response body.

    The response object is closed even when reading fails, so the
    connection is not left open.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()

def getlink(html, type):
    """Print every book link found in *html* and download the matching ones.

    html -- page source to scan for ``<li> <a href=...>`` entries.
    type -- format label (e.g. 'PDF'); a link is downloaded when this text
            occurs in its caption.  An empty label downloads nothing.

    Links whose caption is the "read online" text are listed but never
    downloaded.  Each download runs ``curl -O`` and blocks until it exits;
    OSError is raised if curl cannot be started.
    """
    link_re = re.compile('<li> <a href="(.*?)" target="_blank">(.*?)</a>', re.S)
    for href, caption in link_re.findall(html):
        print("%s %s" % (href, caption))
        if caption == r"在线阅读":
            continue
        # An empty label is a substring of every caption, so without this
        # guard a missing choice would download every file on the page.
        if type and type in caption:
            # The URL comes from scraped HTML: pass it as a single argv
            # entry so it is never interpreted by a shell.
            subprocess.call(["curl", "-O", href])

# Menu shown to the user; the leading number is what they type in.
# (Named `menu` rather than `type` so the builtin is not shadowed.)
menu = """
1.pdf
2.epub
3.mobi"""


print(menu)

# Maps the menu number to the label that appears in the link captions.
formats = {'1': 'PDF', '2': 'EPUB', '3': 'MOBI'}

yourtype = raw_input("please input your type: ")
downtype = formats.get(yourtype, "")

if yourtype == '':
    print("please choose one type!")
elif not downtype:
    print("please check your choice!")
else:
    # Only touch the network once the choice is known to be valid; an
    # empty label must never reach getlink, where it would match every link.
    html = getHtml("https://testerhome.com/topics/4419")
    getlink(html, downtype)

改进后:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
__author__ = 'anderson'


#coding=utf-8
import urllib
import re
import os
import threading,time
from time import sleep, ctime

class get_book(object):
    """Scrape one page for book links and download those of one format.

    nsec  -- seconds to pause after each download.
    url   -- page address used by getHtml when no address is passed.
    types -- format label (e.g. 'PDF') used by getlink when none is passed.
    """

    # Compiled once for the class instead of on every getlink call.
    LINK_RE = re.compile('<li> <a href="(.*?)" target="_blank">(.*?)</a>', re.S)

    def __init__(self, nsec, url, types):
        self.types = types
        self.nsec = nsec
        self.url = url

    def now(self):
        """Return the current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return time.strftime('%Y-%m-%d %H:%M:%S')

    def getHtml(self, url=None):
        """Download a page and return its raw body.

        url -- address to fetch; the instance's own url is used when this
               is omitted or empty.

        The response object is closed even when reading fails.
        """
        page = urllib.urlopen(url or self.url)
        try:
            return page.read()
        finally:
            page.close()

    def getlink(self, html, types=None):
        """Print every link found in *html* and download the matching ones.

        html  -- page source to scan for ``<li> <a href=...>`` entries.
        types -- format label to look for in each caption; the instance's
                 own label is used when this is omitted or empty.  If both
                 are empty nothing is downloaded.

        Each download runs ``curl -O`` and blocks until it exits, then
        pauses for ``nsec`` seconds.  OSError is raised if curl cannot be
        started.
        """
        # Imported here so the class does not depend on the file header.
        import subprocess

        wanted = types or self.types
        for href, caption in self.LINK_RE.findall(html):
            print("%s %s" % (href, caption))
            if wanted and wanted in caption:
                # The URL comes from scraped HTML: pass it as a single argv
                # entry so it is never interpreted by a shell.
                subprocess.call(["curl", "-O", href])
                sleep(self.nsec)


def main():
    """Ask which format to fetch, then download the matching books.

    Prints the available choices, reads one from stdin and returns early
    with a message when the input is not one of the listed numbers.
    Otherwise the page is fetched once and scanned on a worker thread,
    with start and finish times printed around the run.
    """
    types = {1: "PDF", 2: "EPUB", 3: "MOBI"}

    print(sorted(types.items()))

    page_url = "https://testerhome.com/topics/4419"

    yourtype = raw_input("please input your type: ")

    # Non-numeric input is treated like any other invalid choice.
    try:
        choice = int(yourtype)
    except ValueError:
        choice = None

    # Test membership in the dict itself: range(1, 3) stops at 2 and
    # would reject the MOBI entry.
    if choice not in types:
        print("please select again")
        return

    wanted = types[choice]
    print(wanted)

    get = get_book(2, page_url, wanted)
    html = get.getHtml(page_url)

    print('starting at: %s' % get.now())

    # getlink walks the whole page, so additional workers would only
    # download the same files again and race on the same output names.
    # Raise this only once the links are partitioned between workers.
    worker_count = 1
    threadpool = [
        threading.Thread(target=get.getlink, args=(html, wanted))
        for _ in range(worker_count)
    ]

    for th in threadpool:
        th.start()

    for th in threadpool:
        th.join()

    print('all Done at: %s' % get.now())


if __name__ == '__main__':
    main()

然后看看效果。