Add proxy pool, fix 503 errors
wonder authored and wonder committed Nov 14, 2017
commit 49c69beee41baf04f0406038b15a8a3856e7d120
Binary file modified .DS_Store
Binary file not shown.
4 changes: 3 additions & 1 deletion pdf/README.md
@@ -24,6 +24,8 @@ $ sudo yum install wkhtmltopdf # centos

### Run
```python
python get_proxy.py
python censor.py
python crawler.py
```
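
Run `get_proxy.py` first; judging by `crawler.py`, it is presumably the script that writes the `proxies.txt` file (one proxy per line, tab-separated host and port) from which the crawler draws its proxy pool.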

@@ -42,7 +44,7 @@ python crawler.py
### Changelog

* 2017-2-21: Fully refactored the code for extensibility; a subclass crawler only needs to implement the `parse_menu` and `parse_body` methods to get the whole HTML-to-PDF pipeline (see the sketch after this list)

* 2017-11-14: Added a proxy pool to fix 503 errors
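
A minimal sketch of that subclass API. The base-class name `Crawler` and the CSS selectors are assumptions for illustration; only the `parse_menu`/`parse_body` contract and the `(name, start_url)` constructor are taken from this repo:

```python
from bs4 import BeautifulSoup

from crawler import Crawler  # assumed base-class name; not shown in this diff


class MyDocsCrawler(Crawler):  # hypothetical subclass for illustration
    def parse_menu(self, response):
        """Yield the URL of every page that should go into the PDF."""
        soup = BeautifulSoup(response.text, "html.parser")
        for a in soup.select("ul.toc a"):  # selector depends on the target site
            yield a.get("href")

    def parse_body(self, response):
        """Return the cleaned-up HTML for one page."""
        soup = BeautifulSoup(response.text, "html.parser")
        return str(soup.find(class_="content"))  # selector depends on the target site


if __name__ == "__main__":
    MyDocsCrawler("my-docs", "https://example.com/docs").run()
```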

### Contact me

48 changes: 34 additions & 14 deletions pdf/crawler.py
@@ -3,13 +3,12 @@

import logging
import os
import random
import re
import time
from urllib import request

try:
    from urllib.parse import urlparse  # py3
except:
    from urlparse import urlparse  # py2
from urllib.parse import urlparse

import pdfkit
import requests
@@ -25,7 +24,6 @@
{content}
</body>
</html>

"""


@@ -51,7 +49,26 @@ def request(url, **kwargs):
    Make an HTTP request and return the response object.
    :return:
    """
    response = requests.get(url, **kwargs)
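    # Assumption (not documented in this commit): proxies.txt holds one proxy
    # per line as tab-separated host and port, e.g. "1.2.3.4\t8080";
    # get_proxy.py presumably generates it.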
    with open('proxies.txt', 'r') as fp:
        ips = fp.readlines()

    # Build the proxy pool: one {'https': 'host:port'} dict per line.
    proxies = []
    for line in ips:
        host, port = line.strip('\n').split('\t')
        proxies.append({'https': host + ':' + port})

    headers = {
        'user-agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'}
    # Retry with a randomly chosen proxy until one returns HTTP 200.
    # Note: this loops forever if no proxy ever succeeds.
    while True:
        try:
            response = requests.get(url, headers=headers,
                                    proxies=random.choice(proxies), timeout=2)
            if response.status_code == 200:
                break
        except Exception as e:
            logging.warning(e)
    return response

def parse_menu(self, response):
@@ -70,6 +87,7 @@ def parse_body(self, response):

def run(self):
    start = time.time()

    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
@@ -93,7 +111,6 @@ def run(self):
        with open(f_name, 'wb') as f:
            f.write(html)
        htmls.append(f_name)

    pdfkit.from_file(htmls, self.name + ".pdf", options=options)
    for html in htmls:
        os.remove(html)
@@ -112,10 +129,12 @@ def parse_menu(self, response):
    :param response: the response object returned by the crawler
    :return: a generator of URLs
    """
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    for li in menu_tag.find_all("li"):
        url = li.a.get("href")
    soup = BeautifulSoup(response.text, "html.parser")
    # The second matching list is assumed to be the table of contents.
    menu_tag = soup.find_all('ul', class_="uk-nav uk-nav-side")[1]
    for a in menu_tag.find_all("a"):
        url = a.get("href")
        if not url.startswith("http"):
            url = "".join([self.domain, url])  # expand to an absolute URL
        yield url
@@ -127,9 +146,10 @@ def parse_body(self, response):
    :return: the processed HTML text
    """
    try:
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find_all(class_="x-wiki-content")[0]
        soup = BeautifulSoup(response.text, 'html.parser')
        body = soup.find_all(class_="x-wiki-content x-main-content")[0]

        # Add the title, centered
        title = soup.find('h4').get_text()
        center_tag = soup.new_tag("center")
@@ -158,6 +178,6 @@ def func(m):


if __name__ == '__main__':
    start_url = "http://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
    start_url = "https://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
    crawler = LiaoxuefengPythonCrawler("廖雪峰Git", start_url)
    crawler.run()