Add proxy pool, fix 503 errors
wonder authored and wonder committed Nov 14, 2017
commit 49c69beee41baf04f0406038b15a8a3856e7d120
Binary file modified .DS_Store
Binary file not shown.
4 changes: 3 additions & 1 deletion pdf/README.md
@@ -24,6 +24,8 @@ $ sudo yum install wkhtmltopdf # centos

### Run
```python
python get_proxy.py
python censor.py
python crawler.py
```
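
Run `get_proxy.py` first; judging by `crawler.py`, it is presumably the script that writes the `proxies.txt` file (one proxy per line, tab-separated host and port) from which the crawler draws its proxy pool.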

@@ -42,7 +44,7 @@ python crawler.py
### Changelog

* 2017-2-21: Fully refactored the code for extensibility; a subclass crawler only needs to implement the `parse_menu` and `parse_body` methods to get the whole HTML-to-PDF pipeline (see the sketch after this list)

* 2017-11-14: Added a proxy pool to fix 503 errors
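
A minimal sketch of that subclass API. The base-class name `Crawler` and the CSS selectors are assumptions for illustration; only the `parse_menu`/`parse_body` contract and the `(name, start_url)` constructor are taken from this repo:

```python
from bs4 import BeautifulSoup

from crawler import Crawler  # assumed base-class name; not shown in this diff


class MyDocsCrawler(Crawler):  # hypothetical subclass for illustration
    def parse_menu(self, response):
        """Yield the URL of every page that should go into the PDF."""
        soup = BeautifulSoup(response.text, "html.parser")
        for a in soup.select("ul.toc a"):  # selector depends on the target site
            yield a.get("href")

    def parse_body(self, response):
        """Return the cleaned-up HTML for one page."""
        soup = BeautifulSoup(response.text, "html.parser")
        return str(soup.find(class_="content"))  # selector depends on the target site


if __name__ == "__main__":
    MyDocsCrawler("my-docs", "https://example.com/docs").run()
```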

### Contact me

48 changes: 34 additions & 14 deletions pdf/crawler.py
@@ -3,13 +3,12 @@

import logging
import os
import random
import re
import time
from urllib import request

try:
    from urllib.parse import urlparse  # py3
except:
    from urlparse import urlparse  # py2
from urllib.parse import urlparse

import pdfkit
import requests
@@ -25,7 +24,6 @@
{content}
</body>
</html>

"""


@@ -51,7 +49,26 @@ def request(url, **kwargs):
    Make an HTTP request and return the response object.
    :return:
    """
    response = requests.get(url, **kwargs)
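    # Assumption (not documented in this commit): proxies.txt holds one proxy
    # per line as tab-separated host and port, e.g. "1.2.3.4\t8080";
    # get_proxy.py presumably generates it.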
    with open('proxies.txt', 'r') as fp:
        ips = fp.readlines()

    # Build the proxy pool: one {'https': 'host:port'} dict per line.
    proxies = []
    for line in ips:
        host, port = line.strip('\n').split('\t')
        proxies.append({'https': host + ':' + port})

    headers = {
        'user-agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'}
    # Retry with a randomly chosen proxy until one returns HTTP 200.
    # Note: this loops forever if no proxy ever succeeds.
    while True:
        try:
            response = requests.get(url, headers=headers,
                                    proxies=random.choice(proxies), timeout=2)
            if response.status_code == 200:
                break
        except Exception as e:
            logging.warning(e)
    return response

def parse_menu(self, response):
@@ -70,6 +87,7 @@ def parse_body(self, response):

def run(self):
    start = time.time()

    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
@@ -93,7 +111,6 @@ def run(self):
        with open(f_name, 'wb') as f:
            f.write(html)
        htmls.append(f_name)

    pdfkit.from_file(htmls, self.name + ".pdf", options=options)
    for html in htmls:
        os.remove(html)
@@ -112,10 +129,12 @@ def parse_menu(self, response):
    :param response: the response object returned by the crawler
    :return: a generator of URLs
    """
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    for li in menu_tag.find_all("li"):
        url = li.a.get("href")
    soup = BeautifulSoup(response.text, "html.parser")
    # The second matching list is assumed to be the table of contents.
    menu_tag = soup.find_all('ul', class_="uk-nav uk-nav-side")[1]
    for a in menu_tag.find_all("a"):
        url = a.get("href")
        if not url.startswith("http"):
            url = "".join([self.domain, url])  # expand to an absolute URL
        yield url
@@ -127,9 +146,10 @@ def parse_body(self, response):
    :return: the processed HTML text
    """
    try:
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find_all(class_="x-wiki-content")[0]
        soup = BeautifulSoup(response.text, 'html.parser')
        body = soup.find_all(class_="x-wiki-content x-main-content")[0]

        # Add the title, centered
        title = soup.find('h4').get_text()
        center_tag = soup.new_tag("center")
@@ -158,6 +178,6 @@ def func(m):


if __name__ == '__main__':
    start_url = "http://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
    start_url = "https://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
    crawler = LiaoxuefengPythonCrawler("廖雪峰Git", start_url)
    crawler.run()