分类
2015-09-23 16:36作者:yezheng
在爬取一个专利查询网站的时候,反爬非常严,所以找了这样一段代码:
从免费代理IP发布网站 http://www.xicidaili.com/nn/ 获取代理IP
通过多线程验证代理IP是否能够正常访问目标网站
网站需要设置cookies登录
1. [代码]代理ip采集
import urllib2
from BeautifulSoup import BeautifulSoup

# Collect proxies: walk listing pages 1..49 of xicidaili.com and store every
# HTTP/HTTPS entry in proxy.txt, one "PROTOCOL=ip:port" record per line.
user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"

# The with-block guarantees proxy.txt is flushed and closed even if a page
# fails to download or parse.
with open('proxy.txt', 'w') as of:
    for page in range(1, 50):
        url = 'http://www.xicidaili.com/nn/%s' % page
        request = urllib2.Request(url)
        # Send a browser-like User-Agent header with every request.
        request.add_header("User-Agent", user_agent)
        content = urllib2.urlopen(request)
        try:
            soup = BeautifulSoup(content)
        finally:
            content.close()
        trs = soup.find('table', {"id": "ip_list"}).findAll('tr')
        # trs[0] is skipped (presumably the table header row).
        for tr in trs[1:]:
            tds = tr.findAll('td')
            if len(tds) < 7:
                # Not a full data row; the cell indexes below would fail.
                continue
            ip = tds[2].text.strip()
            port = tds[3].text.strip()
            protocol = tds[6].text.strip()
            if protocol in ('HTTP', 'HTTPS'):
                # A real newline terminates each record so that the
                # validator can read the file back line by line.
                of.write('%s=%s:%s\n' % (protocol, ip, port))
                print('%s://%s:%s' % (protocol, ip, port))
2. [代码]验证
import urllib2
import threading
# Page fetched through every candidate proxy to decide whether it works.
url = 'http://www.lindenpat.com/search/detail/index?d=CN03819011@CN1675532A@20050928'
# Guards the two shared file objects below, which the worker threads use.
lock = threading.Lock()
# Input: candidate proxies, read one line at a time by the workers.
inFile = open('proxy.txt')
# Output: proxies that passed the check.
outFile = open('available.txt', 'w')
def test():
    """Worker: pull proxies from ``inFile`` and record those that work.

    Repeatedly reads one ``PROTOCOL=ip:port`` line from the shared input
    file, fetches ``url`` through that proxy with the session cookie, and
    appends the proxy to ``outFile`` when the response is at least 1000
    bytes long. Returns when the input file is exhausted.
    """
    cookie = "PHPSESSID=5f7mbqghvk1kt5n9illa0nr175; kmsign=56023b6880039; KMUID=ezsEg1YCOzxg97EwAwUXAg=="
    while True:
        # The file object is shared by all workers, so reads are serialized.
        with lock:
            raw = inFile.readline()
        if not raw:
            # readline() returns '' only at end of file.
            return
        protocol, sep, proxy = raw.strip().partition('=')
        if not sep or not proxy:
            # Blank or malformed line: skip it instead of crashing the thread.
            continue
        scheme = protocol.lower()
        # NOTE(review): ProxyHandler applies a proxy only to requests whose
        # URL scheme matches the dict key, so entries listed as HTTPS are
        # probably not exercised by an http:// target -- confirm.
        try:
            proxy_support = urllib2.ProxyHandler({scheme: '%s://%s' % (scheme, proxy)})
            opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
            request = urllib2.Request(url)
            request.add_header("cookie", cookie)
            # Use this thread's own opener. urllib2.install_opener() is
            # process-global, so concurrent workers would overwrite each
            # other's proxy settings.
            response = opener.open(request, timeout=4)
            try:
                content = response.read()
            finally:
                response.close()
        except Exception as error:
            # Best effort: a dead or slow proxy is reported and skipped.
            print(error)
            continue
        if len(content) >= 1000:
            # Serialize console and file output across workers.
            with lock:
                print('add proxy %s' % proxy)
                outFile.write('"%s",\n' % proxy)
        else:
            print('出现验证码或IP被封杀')
# Start 500 worker threads running test(), wait for all of them, then close
# the shared files.
all_thread = []
try:
    for i in range(500):
        t = threading.Thread(target=test)
        all_thread.append(t)
        t.start()
    for t in all_thread:
        t.join()
finally:
    # Close the files even if starting or joining a thread fails.
    inFile.close()
    outFile.close()
相关