Tested and passing in PyDev. The code I wrote earlier… and the 192.… part still has to be handled by hand… Notes on encoding: some codecs don't support Chinese; bs4 can detect a page's encoding on its own, and quickly; the parser plugged in behind bs4 (lxml here) can be swapped out, though lxml is resource-hungry. A short sketch of the parser swap follows, then the full script.
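A minimal sketch (not from the original script) of the two points above: BeautifulSoup sniffs the document encoding by itself (via its UnicodeDammit helper), and the backend parser is just a constructor argument, so "lxml" can be traded for the slower but dependency-free "html.parser". The page.html file here is a hypothetical stand-in for any fetched bytes.

#coding=utf-8
from bs4 import BeautifulSoup

# Hypothetical local copy of a page; raw bytes are fine, bs4 detects
# the encoding itself.
raw = open('page.html', 'rb').read()

soup_fast = BeautifulSoup(raw, 'lxml')           # fast, needs the lxml package
soup_light = BeautifulSoup(raw, 'html.parser')   # pure-Python stdlib fallback

print soup_fast.original_encoding                # the encoding bs4 detected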
#coding=utf-8
'''
Created on 2015-12-03
@FileName: Test.py
@Description: (description)
@Site: http://www.sugarguo.com/
@author: 'Sugarguo'
@version V1.0.0
'''
import sys
reload(sys)
sys.path.append("..")
sys.setdefaultencoding('utf8')

import requests
from lxml import etree
from bs4 import BeautifulSoup

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    #'Accept-Encoding': 'gzip, deflate',
    'Connection': 'close',
}

def getHtmlWithoutProxy(url):
    try:
        # User-Agent must be a plain string; wrapping it in {...}
        # builds a set and sends an invalid header value.
        headers['User-Agent'] = "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"
        r = requests.get(url, headers=headers, timeout=5)
        print 'Encoding: ', r.encoding
        print 'Get Url: ', url
        print 'Status Code: ', r.status_code
        return r.content                  # raw bytes; let the caller decode
    except Exception as e:
        print e
        print "**********[ Error! ]**********\n"
        return None

# bs4 detects the page encoding on its own, so raw bytes are fine here.
content_html = getHtmlWithoutProxy('http://www.biquge.la/book/176/5054609.html')
soup = BeautifulSoup(content_html, "lxml")
print soup.find(id="content").get_text()

# Decode before lower(): the site serves gb2312/gbk, and lowercasing the
# raw bytes can corrupt multi-byte characters whose second byte falls in
# the ASCII A-Z range.
list_html = getHtmlWithoutProxy('http://www.biquge.la/book/176').decode('gb2312').lower()
list_page = etree.HTML(list_html)
novelTitle = list_page.xpath(u'//div[@id="list"]/dl/dd/a/text()')
for item in novelTitle:
    print item
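A related sketch, again an assumption rather than part of the original post: instead of hard-coding 'gb2312', requests can suggest a codec. r.encoding comes from the HTTP headers, while r.apparent_encoding is chardet's guess from the body; 'gbk' is a superset of gb2312, so upgrading to it tolerates pages that declare gb2312 but contain GBK-only characters.

#coding=utf-8
import requests

r = requests.get('http://www.biquge.la/book/176', timeout=5)

# Prefer the body-based guess, fall back to the header value, then gbk.
codec = r.apparent_encoding or r.encoding or 'gbk'
if codec.lower() == 'gb2312':
    codec = 'gbk'                              # superset of gb2312, safer for Chinese text

text = r.content.decode(codec, 'replace')      # 'replace' avoids crashing on stray bytes
print text[:200]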