|
本帖最后由 jsenet 于 2015-9-13 13:02 编辑
以前就弄好了,忘了放出来,懂的拿去用吧。
多线程采集,字数对比,目标站更新一般30秒内同步。
转载注明作者:vpskk.com(还未上线,嘿嘿)
- #!coding=UTF-8
- import urllib
- import urllib2
- import commands
- import time
- import threading
- import os
- import re
- import sys
- thlen = 10
- #定义同时采集的线程数
- books = []
- #定义需要采集的书库
- tsk = []
- #定义采集线程数组
- bookdict = {}
- #定义已采集图书字典,key为目标站书号,value为字数
- domain = 'yoursite.domain.com'
- adminuser = 'admin'
- adminpass = '******'
- siteid = '23'
- # notaddnew = '0'
- frompage = 'http://all.17k.com/lib/book/2_0_0_0_0_0_2_0_1.html'
- def addbooklist():
- while 1:
- time.sleep(30)
- print '[' + time.strftime('%H:%M:%S') + '] 采集更新列表线程启动。'
- start = time.time()
- try:
- response = urllib2.urlopen(frompage, timeout = 12)
- content = response.read()
- except:
- continue
- response.close()
- elapsed = (time.time() - start)
- bookattr = re.findall(r'<a class="jt" rel="/tip\.xhtml\?book\.id=([0-9]+)\&difference[^>]+>([^<]+)</a>*[\s\S]*?<td class="td5">([0-9]+)</td>',content,re.M)
- print '[' + time.strftime('%H:%M:%S') + '] 采集更新列表结束,用时:' + str(elapsed) + '秒'
- for ii in range(len(bookattr)):
- newbookid = bookattr[ii][0]
- newbookname = bookattr[ii][1]
- newbooksize = bookattr[ii][2]
- inlist = False
- for tt in range(len(books)):
- if (books[tt][0]==newbookid):
- inlist = True
- if not inlist:
- #书号不在待采集数组里
- if (newbookid in bookdict.keys()):
- #书号在已采集过的字典里(需要根据字数来判断是否有更新)
- if (int(newbooksize)>int(bookdict[newbookid])):
- #采集到书籍字数大于已采集字典里的字数(添加到待采集列表)
- books.append([newbookid,newbookname,newbooksize])
- print '书号:' + newbookid + '有更新,旧字数:'+ bookdict[newbookid] + ' 新字数:'+ newbooksize + ' 添加到待采集列表。'
- else:
- #书号不在已采集过的字典里(添加到待采集列表)
- books.append([newbookid,newbookname,newbooksize])
- print '书号:' + newbookid + '最近未采集,添加到待采集列表。'
- print '[' + time.strftime('%H:%M:%S') + '] 采集更新列表线程完成,线程休眠。'
- def caiji(bookid,bookname,booksize):
- print '正在采集 书号[' + bookid + '] 书名:' + bookname
- url = 'http://'+ domain + '/modules/article/admin/batchcollect.php?action=bcollect&siteid=' + siteid + '&batchids=' + bookid + '&jieqi_username=' + adminuser + '&jieqi_userpassword=' + adminpass
- start = time.time()
- page = urllib2.urlopen(url,timeout=3600)
- data = page.read(8192)
- while data:
- data = page.read(8192)
- page.close()
- elapsed = (time.time() - start)
- time.sleep(5) #采集完等5秒生成全书
- print '书号[' + bookid + '] 书名:' + bookname + '字数:' + booksize + 'k 采集完成! 用时:' + str(elapsed) + '秒'
- print '书号[' + bookid + '] 书名:' + bookname + '字数:' + booksize + 'k 添加到最近采集书目字典。'
- # 从网页获取要采集的文章ID和文章名字(首次)
- start = time.time()
- response = urllib2.urlopen(frompage, timeout = 12)
- content = response.read()
- response.close()
- elapsed = (time.time() - start)
- getattr = re.findall(r'<a class="jt" rel="/tip\.xhtml\?book\.id=([0-9]+)\&difference[^>]+>([^<]+)</a>*[\s\S]*?<td class="td5">([0-9]+)</td>',content,re.M)
- #getsize = re.findall(r'<td class="td5">([0-9]+)</td>',content,re.M)
- print '首次获取要采集的文章共' + str(len(getattr)) +'篇,用时:' + str(elapsed) + '秒'
- books = books + getattr
- if (len(books)<3):
- print('获取列表页失败,退出!')
- exit()
- #启动书籍列表采集线程
- thaddbooklist = threading.Thread(target=addbooklist,name='taddbooklist')
- thaddbooklist.start()
-
- for x in range(thlen):
- bookid = books[0][0]
- bookname = books[0][1]
- booksize = books[0][2]
- tname = 't' + str(x)
- th = threading.Thread(target=caiji,name=tname,args=(bookid,bookname,booksize))
- th.start()
- del books[0]
- bookdict[bookid] = booksize
- tsk.append(th)
- #检测空闲线程,当线程闲置时,若待采集列表不为空时,启用该线程进行采集
- while 1:
- time.sleep(5)
- for i in range(len(tsk)):
- if not tsk[i].is_alive():
- print tsk[i].name + '线程空闲'
- if len(books) > 0:
- bookid = books[0][0]
- bookname = books[0][1]
- booksize = books[0][2]
- th = threading.Thread(target=caiji,name=tsk[i].name,args=(bookid,bookname,booksize))
- th.start()
- del books[0]
- bookdict[bookid] = booksize
- tsk[i] = th
复制代码
效果演示:同步17k,当然,我只采集VIP小说。后台挂着,只要17k有vip小说更新,则自动同步。
caiji.png
(124.12 KB, 下载次数: 11)
|
评分
-
查看全部评分
|