I won't introduce proxies themselves here; for a quick primer on proxy types (transparent, anonymous, distorting and elite/high-anonymity proxies) see the post proxy代理类型:透明代理 匿名代理 混淆代理和高匿代理. This post collects some notes on using proxies in Python crawlers, a small proxy pool class, and a couple of urllib examples taken from other posts.
To test whether a proxy is working, fetch http://icanhazip.com and look at the content: the page simply echoes back the public IP your request came from.
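For example, fetching the page once directly and once through a proxy and comparing the two results tells you whether the proxy is actually in effect (a minimal sketch using the requests module introduced below; the proxy address is a placeholder):

import requests

proxy = "http://112.25.41.136:80"   # placeholder proxy address
own_ip = requests.get("http://icanhazip.com", timeout=10).text.strip()
via_proxy = requests.get("http://icanhazip.com",
                         proxies={"http": proxy}, timeout=10).text.strip()
print(own_ip, via_proxy)            # two different IPs means the proxy is working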
Using a proxy with the urllib module
Using a proxy with urllib/urllib2 is somewhat cumbersome: first build a ProxyHandler, then use it to build an opener, and finally install that opener so it is used for subsequent requests. The proxy format is "http://112.25.41.136:80"; if a username and password are required, use "http://user:password@112.25.41.136:80".
import urllib.request

proxy = "http://112.25.41.136:80"
# Build a ProxyHandler object from the given proxy
proxy_support = urllib.request.ProxyHandler({'http': proxy})
# Build an opener that uses the ProxyHandler
opener = urllib.request.build_opener(proxy_support)
# Install the opener globally so urlopen goes through the proxy
urllib.request.install_opener(opener)
# Open the url through the proxy
r = urllib.request.urlopen('http://icanhazip.com', timeout=1000)
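If the proxy needs the username/password form mentioned above, the same credentials-in-URL string can be handed to ProxyHandler (a sketch; user, password and the address are placeholders):

import urllib.request

auth_proxy = "http://user:password@112.25.41.136:80"   # placeholder credentials
proxy_support = urllib.request.ProxyHandler({'http': auth_proxy})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
r = urllib.request.urlopen('http://icanhazip.com', timeout=10)
print(r.read().decode('utf-8'))    # should print the proxy's IP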
Using a proxy with the requests module
Using a proxy with requests is much simpler than with urllib. The example here configures a single request; for repeated requests you can attach the proxies to a Session instead (see the sketch after the example). To use a proxy, pass the proxies argument to any request method:
import requests

proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}
r = requests.get("http://icanhazip.com", proxies=proxies)
print(r.text)
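For repeated requests, the same proxies dict can be attached to a Session so that every request made through it goes out via the proxy (a minimal sketch, reusing the placeholder addresses above):

import requests

session = requests.Session()
session.proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}
r = session.get("http://icanhazip.com", timeout=10)
print(r.text)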
You can also configure proxies through the environment variables HTTP_PROXY and HTTPS_PROXY:
export HTTP_PROXY="http://10.10.1.10:3128"
export HTTPS_PROXY="http://10.10.1.10:1080"
python
>>> import requests
>>> r=requests.get("http://icanhazip.com")
>>> print(r.text)
If your proxy requires HTTP Basic Auth, use the http://user:password@host/ syntax:
proxies = {
    "http": "http://user:pass@10.10.1.10:3128/",
}
Example script
Here is a proxy pool class built around the elite (high-anonymity) proxies from gatherproxy; other sources such as Xici proxy (西刺代理) can be wrapped the same way.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Platinhom"
__date__ = "2016.1.29 23:30"

import re, requests, random

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

class GatherProxy(object):
    '''Fetch proxies from http://gatherproxy.com/'''
    url = 'http://gatherproxy.com/proxylist'
    pre1 = re.compile(r'<tr.*?>(?:.|\n)*?</tr>')
    pre2 = re.compile(r"(?<=\(\').+?(?=\'\))")

    def getelite(self, pages=1, uptime=70, fast=True):
        '''Get elite (high-anonymity) proxies.
        pages:  how many result pages to fetch
        uptime: minimum uptime (L/D) to accept
        fast:   only keep proxies marked as fast (short response time)'''
        proxies = set()
        for i in range(1, pages + 1):
            params = {"Type": "elite", "PageIdx": str(i), "Uptime": str(uptime)}
            r = requests.post(self.url + "/anonymity/t=Elite", params=params, headers=header)
            for td in self.pre1.findall(r.text):
                if fast and 'center fast' not in td:
                    continue
                try:
                    tmp = self.pre2.findall(str(td))
                    if len(tmp) == 2:
                        # the IP is plain text, the port is obfuscated as a hex string
                        proxies.add(tmp[0] + ":" + str(int('0x' + tmp[1], 16)))
                except:
                    pass
        return proxies
class ProxyPool(object):
    '''A proxy pool class to obtain working proxies'''
    gatherproxy = GatherProxy()

    def __init__(self):
        self.pool = set()

    def updateGatherProxy(self, pages=1, uptime=70, fast=True):
        '''Use GatherProxy to update the proxy pool'''
        self.pool.update(self.gatherproxy.getelite(pages=pages, uptime=uptime, fast=fast))

    def removeproxy(self, proxy):
        '''Remove a proxy from the pool'''
        if proxy in self.pool:
            self.pool.remove(proxy)

    def randomchoose(self):
        '''Randomly pick a proxy from the pool, refilling the pool if it is empty'''
        if not self.pool:
            self.updateGatherProxy()
        return random.sample(list(self.pool), 1)[0]

    def getproxy(self):
        '''Get a working proxy as a dict, retrying until one responds'''
        proxy = self.randomchoose()
        proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
        #r = requests.get('http://icanhazip.com', proxies=proxies, timeout=1)
        try:
            r = requests.get('http://dx.doi.org', proxies=proxies, timeout=1)
            if r.status_code == 200:
                return proxies
            else:
                self.removeproxy(proxy)
                return self.getproxy()
        except:
            self.removeproxy(proxy)
            return self.getproxy()
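A minimal usage sketch for the ProxyPool class above (run in the same script; http://icanhazip.com is only used here to verify the proxy):

pool = ProxyPool()
pool.updateGatherProxy(pages=2)              # fill the pool from gatherproxy.com
proxies = pool.getproxy()                    # a verified {'http': ..., 'https': ...} dict
r = requests.get('http://icanhazip.com', proxies=proxies, timeout=5)
print(r.text)                                # should print the proxy's IP, not yours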
Example 2: boosting CSDN blog views with urllib and proxies (reposted)
Reposted from FadeTrack's post Python爬虫入门《下》 (an introductory Python crawler series); it uses Xici proxy (西刺代理) as the proxy source.
# Boost CSDN blog view counts through proxies
import urllib.request
import re, random
from multiprocessing.dummy import Pool as ThreadPool

time_out = 3            # global timeout in seconds
count = 0               # how many times to visit each post
proxies = [None]        # None means "no proxy, connect directly"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
def get_proxy():
    # fetch the proxy list page and append usable proxies to the global list
    global proxies
    try:
        req = urllib.request.Request('http://www.xicidaili.com/', None, headers)
        response = urllib.request.urlopen(req)
    except:
        print('Could not fetch the proxy list!')
        return
    html = response.read().decode('utf-8')
    p = re.compile(r'''<tr\sclass[^>]*>\s+
                       <td>.+</td>\s+
                       <td>(.*)?</td>\s+
                       <td>(.*)?</td>\s+
                       <td>(.*)?</td>\s+
                       <td>(.*)?</td>\s+
                       <td>(.*)?</td>\s+
                       <td>(.*)?</td>\s+
                       </tr>''', re.VERBOSE)
    proxy_list = p.findall(html)
    for each_proxy in proxy_list[1:]:
        if each_proxy[4] == 'HTTP':
            proxies.append(each_proxy[0] + ':' + each_proxy[1])
def change_proxy():
    # randomly pick a proxy from the list
    proxy = random.choice(proxies)
    # None means no proxy, i.e. connect directly from the local machine
    if proxy is None:
        proxy_support = urllib.request.ProxyHandler({})
    else:
        proxy_support = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent', headers['User-Agent'])]
    urllib.request.install_opener(opener)
    print('Switched proxy to: %s' % ('local machine' if proxy is None else proxy))

def get_req(url):
    # forge the request headers with a dict first
    blog_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36',
        'Host': 'blog.csdn.net',
        'Referer': 'http://blog.csdn.net/',
        'GET': url
    }
    req = urllib.request.Request(url, headers=blog_header)
    return req
# visit a single blog post
def look_blog(url):
    # switch to another IP first
    change_proxy()
    req = get_req(url)
    try:
        urllib.request.urlopen(req, timeout=time_out)
    except:
        return
    else:
        print('Visit succeeded!')

# visit one post `count` times
def click_blog(url):
    for i in range(0, count):
        print('Visiting blog %s, round %d' % (url, i))
        look_blog(url)

# get the list of post links from the blog's contents page
def get_blog_list(url):
    req = get_req(url)
    try:
        response = urllib.request.urlopen(req, timeout=time_out)
    except:
        print('Unrecoverable error: cannot fetch the blog page')
        return None
    # CSDN pages are utf-8, so no extra conversion is needed here
    html = response.read()
    # regular expression matching the post links
    regx = '<span class="link_title"><a href="(.+?)">'
    pat = re.compile(regx)
    # blog_list = re.findall('<span class="link_title"><a href="(.+?)">', str(html)) gives the same result
    blog_list = re.findall(pat, str(html))
    return blog_list
if __name__ == '__main__':
    # basic initialisation: fetch the proxy list first
    get_proxy()
    print('Number of proxies available: %d' % len(proxies))
    blogurl = input('Enter the blog URL: ')
    # the original author used this as a lazy default input
    if len(blogurl) == 0:
        blogurl = 'http://blog.csdn.net/bkxiaoc/'
    print('Blog URL: %s' % blogurl)
    try:
        count = int(input('Enter the number of visits: '))
    except ValueError:
        print('Invalid number')
        quit()
    if count == 0 or count > 999:
        print('Number of visits is too large or too small')
        quit()
    print('Number of visits confirmed: %d' % count)
    # get the list of posts; the original author only had one page of posts,
    # so only one page is fetched here
    blog_list = get_blog_list(blogurl + '?viewmode=contents')
    if not blog_list:
        print('No blog posts found')
        quit()
    print('Start!')
    # complete the relative links, then visit them with a thread pool
    index = 0
    for each_link in blog_list:
        each_link = 'http://blog.csdn.net' + each_link
        blog_list[index] = each_link
        index += 1
    # use half as many threads as there are posts (at least one)
    pool = ThreadPool(max(1, len(blog_list) // 2))
    results = pool.map(click_blog, blog_list)
    pool.close()
    pool.join()
    print('All done!')