Scrapy middlewares for better user-agent and proxy handling

useragent, proxy

Posted by Shubei on May 7, 2019

1. middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random
import json

from twisted.internet.error import (TimeoutError, TCPTimedOutError, ConnectionRefusedError,
                                    ConnectionDone, ConnectError, ConnectionLost)

from common.proxypool import ProxyPool

class ProxyMiddleware(object):
    '''Proxy middleware: attaches a proxy from the pool to every request and
    retires proxies that get banned or keep failing.'''

    EXCEPTIONS_TO_RETRY = (TimeoutError, ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError)


    def __init__(self, proxy_pool):
        self.proxy_pool = proxy_pool

    @classmethod
    def from_crawler(cls, crawler):
        proxy_pool = ProxyPool()
        proxy_pool.def_num = 1
        proxy_pool.proxy_url = 'http://api.wandoudl.com/api/ip?app_key=bd69c332749f3801e2efe63855800024&pack=204620&num=10&xy=2&type=2&lb=\r\n&mr=2&'
        # seed the pool with a known proxy; the value is (expire_time, fail_times)
        proxy_pool.pool['59.63.67.229:36410'] = ("2019-05-05 11:15:26", 0)

        return cls(proxy_pool=proxy_pool)

    def process_request(self, request, spider):

        proxy = request.meta['cur_proxy'] = request.meta['proxy'] = self.proxy_pool.get_proxy()
        spider.logger.info('proxy:{p}'.format(p=proxy))

    def process_response(self, request, response, spider):
        '''Handle responses that indicate a banned proxy IP.'''

        # the target API answers JSON; guard against a missing 'data' field
        data = json.loads(response.text).get('data') or {}
        error = data.get('error')
        proxy = self.get_current_proxy(request)

        self.proxy_pool.set_proxy_fail_times(proxy, -1)  # a good response lowers the fail count
        if request.meta.get('try_times', 0) >= 1:
            spider.logger.info('response to a retried request, try_times: %s' % request.meta.get('try_times', 0))

        if error and error.get('code', '') == '1004':
            # error code 1004 means the proxy is banned: drop it and retry
            self.proxy_pool.rmv_proxy(proxy)
            spider.logger.error(
                'proxy {a} has been banned, trying again'.format(a=request.meta.get('cur_proxy')))

            return self._retry(request)

        return response

    def get_current_proxy(self, request):
        '''Return the proxy attached to the current request.'''
        return request.meta.get('cur_proxy')

    def process_exception(self, request, exception, spider):
        '''Retry on network-level exceptions and penalize the proxy that caused them.'''

        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):

            spider.logger.error('exception {}, retrying'.format(type(exception)))
            proxy = self.get_current_proxy(request)
            self.proxy_pool.set_proxy_fail_times(proxy, 1)  # raise the proxy's fail count
            return self._retry(request)

    def _retry(self, request):
        '''Build a retried copy of the request, capped at 3 attempts.'''

        try_times = request.meta.get('try_times', 0) + 1

        if try_times > 3:
            raise ValueError('retried over 3 times, {}'.format(request.body))
        r = request.copy()
        r.dont_filter = True
        r.meta['try_times'] = try_times

        return r

class UAMiddleware(object):
    '''Random User-Agent middleware.'''

    def __init__(self, uas):
        self.user_agents = uas

    @classmethod
    def from_crawler(cls, crawler):
        user_agents = crawler.settings.get('UA')

        # settings.get returns None when the key is missing; len(None) would raise
        if not user_agents:
            raise ValueError('failed to load user agents from the UA setting')

        return cls(uas=user_agents)

    def process_request(self, request, spider):

        request.headers['user-agent'] = random.choice(self.user_agents)

        return None
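
A note of my own, not from the original code: Scrapy's built-in HttpProxyMiddleware expects request.meta['proxy'] to be a full URL including the scheme, e.g. 'http://1.2.3.4:8080'. Since generate_proxy below returns bare ip:port strings, normalizing the value in process_request avoids misparsed proxies. A minimal sketch of that variant:

def process_request(self, request, spider):
    proxy = self.proxy_pool.get_proxy()
    # meta['proxy'] needs a scheme for HttpProxyMiddleware to parse it
    if not proxy.startswith(('http://', 'https://')):
        proxy = 'http://' + proxy
    request.meta['cur_proxy'] = request.meta['proxy'] = proxy
    spider.logger.info('proxy:{p}'.format(p=proxy))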

2. Enable the downloader middlewares in settings.py

DOWNLOADER_MIDDLEWARES = {
    'middlewares.ProxyMiddleware': 100,
    'middlewares.UAMiddleware': 200,
}
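
Scrapy's stock UserAgentMiddleware sits at priority 400 in DOWNLOADER_MIDDLEWARES_BASE and only calls headers.setdefault, so the random header set by UAMiddleware at priority 200 survives. To make the custom middleware the single source of user agents regardless, the built-in one can be disabled explicitly (an optional variant, not required for the setup above):

DOWNLOADER_MIDDLEWARES = {
    'middlewares.ProxyMiddleware': 100,
    'middlewares.UAMiddleware': 200,
    # disable the stock user-agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}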

3. Attachments

UA (the user-agent list; it lives in settings.py and is read by UAMiddleware via crawler.settings.get('UA'))

UA = [
     'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
     'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
     'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
     'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
     'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
     'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
     'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
     'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
     'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
     'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
     'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
     'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
     'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
     'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10',
     'Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13',
     'Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+',
     'Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0',
     'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
     'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)',
     'NOKIA5700/ UCWEB7.0.2.37/28/999',
     'Openwave/ UCWEB7.0.2.37/28/999',
     'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999'
     ]

proxypool.py

Proxy pool: maintains a pool of usable proxies. A standalone usage sketch follows the code below.

#!/usr/bin/env python
# encoding: utf-8
'''
@author: xiaobei
@time: 2019/4/27 14:16
@desc: proxy pool
'''
import time
import random
import datetime
import threading

from .proxy import Proxy

class ProxyPool(object):

    def __init__(self):
        self.pool = dict()  # {'ip': (expire_time, fail_times)}
        self.pool_lock = threading.Lock()  # guards mutation of the pool dict
        self.def_num = 1  # minimum number of proxies to keep in the pool
        self.proxy_url = ''
        self.lock = threading.Lock()

        self.generate_status = 0  # generator state: 0 idle, 1 fetching
        self.generater_lock = threading.Lock()  # guards proxy generation
        self.proxy_fail_times_limit = 5  # drop a proxy after 5 failures

    def get_proxy(self) -> str:
        '''Return a usable proxy, refilling the pool when it runs dry.'''

        if len(self.pool) >= self.def_num:
            # random.sample no longer accepts dict views; pick from a list instead
            proxy = random.choice(list(self.pool))

            if self.proxy_is_expired(proxy, self.pool[proxy][0]) or \
               self.get_proxy_fail_times(proxy) > self.proxy_fail_times_limit:
                self.rmv_proxy(proxy)
                return self.get_proxy()

        elif not self.generate_status:
            self.generate_status = True
            print('pool exhausted, fetching new proxies')
            self.generater_lock.acquire()
            proxy_l = self.generate_proxy(self.proxy_url)
            for p in proxy_l:
                self.add_proxy(p['ip'], p['expire_time'])

            self.generate_status = False
            self.generater_lock.release()
            proxy = self.get_proxy()

        else:
            # another thread is already refilling; wait, then try again
            time.sleep(3)
            print('waiting 3 seconds before retrying')
            return self.get_proxy()

        return proxy

    def add_proxy(self, proxy, expire_time=None) -> bool:
        '''Add a proxy to the pool; reject it if it has already expired.'''

        if expire_time and self.proxy_is_expired(proxy, expire_time):
            ret = False
        else:
            self.pool_lock.acquire()
            self.pool[proxy] = (expire_time, 0)
            self.pool_lock.release()
            ret = True

        return ret

    def rmv_proxy(self, proxy) -> bool:
        '''Remove a proxy from the pool.'''

        self.pool_lock.acquire()
        if proxy in self.pool:
            del self.pool[proxy]
        self.pool_lock.release()

        return True

    def set_proxy_fail_times(self, proxy, step=0):
        '''Adjust a proxy's failure count by step (floored at 0).'''

        if proxy in self.pool:
            fail_times = self.pool[proxy][1] + step
            self.pool[proxy] = (self.pool[proxy][0], fail_times if fail_times > 0 else 0)

    def get_proxy_fail_times(self, proxy):
        '''Return a proxy's failure count.'''

        if proxy in self.pool:
            return self.pool[proxy][1]
        else:
            print('queried proxy has already been removed, {p}'.format(p=proxy))
            return 10000  # a very large number, so the caller treats it as failed

    def generate_proxy(self, url) -> list:
        '''Fetch a fresh batch of proxies from the provider.'''
        ips = Proxy.get_proxy_jiguang(url)

        return ips

    def proxy_is_expired(self, proxy, expire_time) -> bool:
        '''Check whether a proxy expires within the next minute.'''

        # discard proxies with less than 1 minute left (string comparison works
        # because both sides use the '%Y-%m-%d %H:%M:%S' format)
        if expire_time < (datetime.datetime.now() + datetime.timedelta(minutes=1)).strftime(
                '%Y-%m-%d %H:%M:%S'):
            print('less than 1 minute of validity left, proxy is about to expire')
            return True

        else:
            return False
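
For a quick sanity check outside Scrapy, the pool can be exercised on its own. A sketch under the same module layout (common/proxy.py, common/proxypool.py); the seeded proxy and expiry date are made up for illustration:

from common.proxy import Proxy
from common.proxypool import ProxyPool

pool = ProxyPool()
pool.def_num = 1
pool.proxy_url = Proxy.url_jiguang
# seed one proxy by hand so get_proxy() has something to return
pool.add_proxy('59.63.67.229:36410', '2050-01-01 00:00:00')

p = pool.get_proxy()               # -> 'ip:port'
pool.set_proxy_fail_times(p, 1)    # record one failure
print(p, pool.get_proxy_fail_times(p))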

proxy.py

Dynamic proxy fetching.

#!/usr/bin/env python
# encoding: utf-8
'''
@author: xiaobei
@time: 2019/3/15 14:39
@desc: dynamic proxy fetching
'''

import time
import requests
import pexpect

class Proxy():
    url_jiguang = 'http://api.wandoudl.com/api/ip?app_key=bd69c332749f3801e2efe63855800024&pack=204620&num=10&xy=2&type=2&lb=\r\n&mr=2&'
    # url_ip3366 = 'http://gec.ip3366.net/api/?key=20190218182629436&getnum=50&anonymoustype=2&area=1&order=2&formats=2&proxytype=01'
    url_ip3366 = 'http://gec.ip3366.net/api/?key=20190218182629436&getnum=100&anonymoustype=2&area=1&order=1&sarea=1&formats=2&proxytype=01'
    N = 1  # minimum number of proxies a fetch must return

    @classmethod
    def get_proxy_jiguang(cls, url=None, expire_time=None, retry_times=0):

        try:
            if url is None:
                url = cls.url_jiguang
            ipools = requests.get(url).json()
        except Exception as e:
            print('request failed, retry: {}'.format(retry_times))
            if retry_times <= 2:  # retry up to 2 times
                retry_times += 1
                # pass url positionally first; the original passed expire_time
                # in the url slot, which broke the retry
                return cls.get_proxy_jiguang(url, expire_time, retry_times=retry_times)
            else:
                raise e

        PROXIES = []
        for i in ipools['data']:
            expire_time = i.get('expire_time')
            print(i)
            info = {'ip': str(i['ip']) + ':' + str(i['port']),
                    'expire_time': i.get('expire_time')
                    }
            PROXIES.append(info)

        # keep polling until the provider has returned at least N proxies
        while len(PROXIES) < cls.N:
            time.sleep(1)
            ipools = requests.get(url).json()

            for i in ipools['data']:
                expire_time = i.get('expire_time')
                info = {'ip': str(i['ip']) + ':' + str(i['port']),
                        'expire_time': i.get('expire_time')
                        }
                PROXIES.append(info)
                if len(PROXIES) >= cls.N:
                    break

        return PROXIES

    @classmethod
    def get_proxy_ip3366(cls, expire_time=None, retry_times=0):

        try:
            ipools = requests.get(cls.url_ip3366).json()
        except Exception as e:
            print('proxy request failed, retry: {}'.format(retry_times))
            time.sleep(2)
            if retry_times <= 3:  # retry up to 3 times
                retry_times += 1
                return cls.get_proxy_ip3366(expire_time, retry_times=retry_times)
            else:
                raise e

        PROXIES = []
        for i in ipools:
            info = {'ip': str(i.get('Ip', i.get('ip'))) + ':' + str(i.get('Port', i.get('port'))),
                    'expire_time': expire_time
                    }
            PROXIES.append(info)
        return PROXIES

    @classmethod
    def get_ip_from_vps(cls, host, port):
        ip_address = None
        child = None
        try:
            cmd = 'ssh root@{host} -p {port} \'bash get_new_ip\''.format(host=host, port=port)
            child = pexpect.spawn(cmd)
            ret = child.expect(['assword', pexpect.TIMEOUT])
            if ret == 0:
                child.sendline('touyan123')
                ret_ip = child.expect([pexpect.EOF, pexpect.TIMEOUT])
                if ret_ip == 0:
                    print('get ip')
                    ip_address = str(child.before, encoding='utf-8').replace('\r', '').split('\n')[2] + ':3828'
                else:
                    print('no ip')
            else:
                print('no ip')
        except Exception:
            print('exception, no ip')

        # child may be None if spawn itself failed
        if child is not None:
            child.close()

        info = {'ip': ip_address,
                'expire_time': '2050-01-01 00:00:00'
                }

        return [info]
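
Wiring the two pieces together outside the middleware looks roughly like this (a sketch, assuming the provider API above is reachable and returns the usual {'data': [...]} payload):

from common.proxy import Proxy
from common.proxypool import ProxyPool

pool = ProxyPool()
pool.proxy_url = Proxy.url_jiguang

# fetch one batch from the provider and load it into the pool
for p in Proxy.get_proxy_jiguang(pool.proxy_url):
    pool.add_proxy(p['ip'], p['expire_time'])

print('pool size:', len(pool.pool))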