A kindred spirit... Biquge (笔趣阁)...
# -*- coding: utf-8 -*-
# Automatic collection script, siteid=24, Biquge (笔趣阁)
import urllib2
import time
import threading
import re
import sys
import subprocess
import MySQLdb

thlen = 30  # number of collection threads to run concurrently
books = []  # pending list: books waiting to be collected
tsk = []    # array holding the collection threads

# Database connection parameters
mysqlhost = 'localhost'
mysqluser = 'jieqi'
mysqlpass = 'jieqi123'
mysqldb = 'jieqi'
mysqlcharset = 'gbk'
mysqlsocket = '/tmp/mysql.sock'

# Collection back-end (JieQi admin) settings
domain = 'cj.jieqixxx.com'
adminuser = 'admin'
adminpass = 'mypassword'
siteid = '24'
# notaddnew = '0'

# Source site settings: page charset and [update-list URL, seconds to sleep between polls]
fromcharset = 'gbk'
frompages = [
    ['http://www.biquge.la/', 30],
]

# Regex that extracts book id, title, latest chapter and author from the update list
regstr = r'<span class="s2"><a href="/book/([0-9]+)/" target="_blank">([^<]+)</a></span><span class="s3"><a [^>]+>([^<]+)</a></span><span class="s4">([^<]+)</span>'
def cth():
    print "Spawned idle collection thread... OK"
def addbooklist(page, sleeptime):
    while 1:
        try:
            print '[' + time.strftime('%H:%M:%S') + '] Update-list collection thread started.'
            start = time.time()
            response = urllib2.urlopen(page, timeout=12)
            content = response.read().decode(fromcharset).encode('utf8')  # convert the fetched page to the script's working encoding (utf8)
            response.close()
            elapsed = (time.time() - start)
            bookattr = re.findall(regstr, content, re.M)
            print '[' + time.strftime('%H:%M:%S') + '] Update list fetched, took ' + str(elapsed) + ' seconds'
            for ii in range(len(bookattr)):
                newbookid = bookattr[ii][0].strip().rstrip('.').decode('utf8')  # content is already utf8, so decode utf8 -> unicode
                newbookname = bookattr[ii][1].strip().rstrip('.').decode('utf8')
                newbooklastchapter = bookattr[ii][2].strip().rstrip('.').decode('utf8')
                newbookauthor = bookattr[ii][3].strip().rstrip('.').decode('utf8')
                inlist = False
                for tt in range(len(books)):
                    if books[tt][0] == newbookid:
                        inlist = True

                needcj = False  # default: this book does not need to be collected
                reason = ''
                if inlist:  # already in the pending list
                    needcj = False
                    reason = 'Book is already in the pending list, not added.'
                else:  # not in the pending list
                    conn = MySQLdb.connect(host=mysqlhost, user=mysqluser, passwd=mysqlpass, db=mysqldb, charset=mysqlcharset, unix_socket=mysqlsocket)
                    cur = conn.cursor()
                    count = cur.execute('select `lastchapter`,`setting` from `jieqi_article_article` where `articlename` REGEXP "^' + newbookname.encode(mysqlcharset) + '" and `author` REGEXP "^' + newbookauthor.encode(mysqlcharset) + '"')
                    if count <= 0:  # this site does not have the book yet
                        needcj = True
                        reason = 'Book not on this site, added to the pending list.'
                        cur.close()
                        conn.close()
                    else:  # the book exists here (matched by title and author)
                        result = cur.fetchone()  # get the latest chapter, source site id and source book id from the local database
                        cur.close()
                        conn.close()
                        lastchapter = result[0]
                        setting = result[1]
                        if 'a:' in setting:  # the book has a setting record, compare it
                            fromsiteid = re.search('"fromsite";s:[0-9]+:"([^"]*)"', setting).group(1)
                            fromarticleid = re.search('"fromarticle";s:[0-9]+:"([^"]*)"', setting).group(1)
                            if (fromsiteid == siteid) and (fromarticleid == newbookid):  # same source (site id and book id match)
                                if newbooklastchapter in lastchapter:  # latest chapter unchanged, no update, nothing to collect
                                    needcj = False
                                    reason = 'Same source and latest chapter unchanged, not added.'
                                else:  # latest chapter changed, there is an update to collect
                                    needcj = True
                                    reason = 'Same source and latest chapter changed, added to the pending list.'
                            else:  # different source
                                needcj = False  # skip it to avoid mismatched chapters
                                reason = 'Different source, not added.'
                        else:  # the last collection was interrupted, so there is no setting record
                            needcj = True
                            reason = 'Last collection of this book was interrupted, added to the pending list.'
                if needcj:
                    books.append((newbookid.encode(mysqlcharset), newbookname.encode(mysqlcharset)))
                print 'Book id: ' + newbookid.encode('utf8') + ' Title: ' + newbookname.encode('utf8') + ' ' + reason
            print '[' + time.strftime('%H:%M:%S') + '] Update-list collection thread finished, sleeping.'
            time.sleep(sleeptime)
        except:
            info = sys.exc_info()
            print info[0], ":", info[1]
            continue
def caiji(bookid, bookname):
    print 'Collecting book id [' + bookid.decode(mysqlcharset).encode('utf8') + '] title: ' + bookname.decode(mysqlcharset).encode('utf8')
    url = 'http://' + domain + '/modules/article/admin/batchcollect.php?action=bcollect&siteid=' + siteid + '&batchids=' + bookid + '&jieqi_username=' + adminuser + '&jieqi_userpassword=' + adminpass
    start = time.time()
    #page = urllib2.urlopen(url, timeout=3600)
    #data = page.read(8192)
    #while data:
    #    data = page.read(8192)
    #page.close()

    content = ''
    p = subprocess.Popen('wget -qO - "' + url + '"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    for line in p.stdout.readlines():
        content += line.decode('gbk').encode('utf8')
    retval = p.wait()
    # the back end prints "点击这里清空小说" ("click here to clear the novel") when chapters do not line up
    reclean = re.search('<a href="([^"]+)">点击这里清空小说', content)
    if reclean:
        cleanlink = 'http://' + domain + reclean.group(1) + '&jieqi_username=' + adminuser + '&jieqi_userpassword=' + adminpass
        print 'Chapters out of sync, clearing the book and recollecting.'
        content = ''
        p = subprocess.Popen('wget -qO - "' + cleanlink + '"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        for line in p.stdout.readlines():
            content += line.decode('gbk').encode('utf8')
        retval = p.wait()
        recj = re.search("url=([^']+)'>", content)
        if recj:
            cjlink = 'http://' + domain + recj.group(1) + '&jieqi_username=' + adminuser + '&jieqi_userpassword=' + adminpass
            print 'Cleared, recollecting...'
            content = ''
            p = subprocess.Popen('wget -qO - "' + cjlink + '"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            for line in p.stdout.readlines():
                content += line.decode('gbk').encode('utf8')
            retval = p.wait()

    elapsed = (time.time() - start)
    time.sleep(5)
    print 'Book id [' + bookid.decode(mysqlcharset).encode('utf8') + '] title: ' + bookname.decode(mysqlcharset).encode('utf8') + ' collected! Took ' + str(elapsed) + ' seconds'
# Create the (idle) collection threads
for x in range(thlen):
    tname = 't' + str(x)
    th = threading.Thread(target=cth, name=tname)
    th.start()  # start one collection thread
    tsk.append(th)

# Start the book-list collection threads, one per source page
for pp in range(len(frompages)):
    thaddbooklist = threading.Thread(target=addbooklist, name='taddbooklist', args=(frompages[pp][0], frompages[pp][1]))
    thaddbooklist.start()

# Watch the collection threads: whenever one is idle and the pending list is not empty, reuse it for the next book
while 1:
    #try:
    # Thread names hold the book id, so an idle thread's name is the book it collected last
    time.sleep(8)
    for i in range(len(tsk)):
        if not tsk[i].is_alive():  # thread is idle
            print 'T' + str(i) + ' is idle, last collected book id: ' + tsk[i].name
            for k in range(len(books)):  # the thread finished, so drop the collected book from the pending list
                if books[k][0] == tsk[i].name:
                    del books[k]  # delete by index
                    break  # break after deleting, otherwise the index runs off the end
            # (deleting by value would be books.remove(book))
            for k in range(len(books)):  # look for a book that is not being collected yet
                if len(books[k]) == 2:  # a 2-tuple (bookid, bookname) means the book is not being collected yet
                    bookid = books[k][0]
                    bookname = books[k][1]
                    th = threading.Thread(target=caiji, name=bookid, args=(bookid, bookname))
                    th.start()
                    books[k] = books[k] + ("1",)  # append a marker element to flag the book as in progress
                    tsk[i] = th
                    break  # must leave the loop once a thread has been started
    #except:
    #    continue
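
For reference, a minimal sketch of what regstr is meant to pull out of the source site's update list. The sample row below is made up to fit the pattern itself (the real markup on www.biquge.la may differ); each match yields a (book id, title, latest chapter, author) tuple, which is exactly what addbooklist() unpacks:

import re

regstr = r'<span class="s2"><a href="/book/([0-9]+)/" target="_blank">([^<]+)</a></span><span class="s3"><a [^>]+>([^<]+)</a></span><span class="s4">([^<]+)</span>'

# Hypothetical row shaped like the pattern above, not copied from the real site.
sample = ('<span class="s2"><a href="/book/1234/" target="_blank">Some Book</a></span>'
          '<span class="s3"><a href="/book/1234/5678.html" target="_blank">Chapter 100</a></span>'
          '<span class="s4">Some Author</span>')

print re.findall(regstr, sample, re.M)
# -> [('1234', 'Some Book', 'Chapter 100', 'Some Author')]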