python爬取代理IP并通过多线程快速验证

更新时间：2025-10-17 21:40:12 作者：yezheng

在爬取一个专利查询网站的时候，反爬非常严，所以找了这样一段代码：
从免费代理Ip发布网站http://www.xicidaili.com/nn/获取代理IP
通过多线程验证代理IP是否能够正常访问目标网站
网站需要设置cookies登录

1. [代码]代理ip采集

import urllib2

from BeautifulSoup import BeautifulSoup

# Scrape the free-proxy listing pages and save every HTTP/HTTPS proxy to
# proxy.txt, one "PROTOCOL=ip:port" entry per line.

# Loop-invariant: the same browser User-Agent is sent with every page request.
user_agent = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"

# `with` guarantees the file is flushed and closed even if a page fails to
# download or parse part-way through the crawl.
with open('proxy.txt', 'w') as of:
    for page in range(1, 50):
        url = 'http://www.xicidaili.com/nn/%s' % page
        request = urllib2.Request(url)
        request.add_header("User-Agent", user_agent)
        content = urllib2.urlopen(request)
        soup = BeautifulSoup(content)
        trs = soup.find('table', {"id": "ip_list"}).findAll('tr')
        # The first row is skipped (presumably the table header).
        for tr in trs[1:]:
            tds = tr.findAll('td')
            ip = tds[2].text.strip()
            port = tds[3].text.strip()
            protocol = tds[6].text.strip()
            if protocol in ('HTTP', 'HTTPS'):
                # One entry per line: the validation script reads this file
                # line by line and splits each line on '='.
                of.write('%s=%s:%s\n' % (protocol, ip, port))
                print('%s://%s:%s' % (protocol, ip, port))

2. [代码]验证

import urllib2

import threading

# Page fetched through every candidate proxy to decide whether it works.
url = 'http://www.lindenpat.com/search/detail/index?d=CN03819011@CN1675532A@20050928'

# Shared by all worker threads to serialize access to the two files below.
lock = threading.Lock()

# Candidate proxies, one "PROTOCOL=ip:port" entry per line.
inFile = open('proxy.txt')

# Proxies that passed the check are appended here.
outFile = open('available.txt', 'w')

def test():
    """Validate proxies read from ``inFile`` until the file is exhausted.

    Each iteration takes one ``PROTOCOL=ip:port`` line (read under ``lock``),
    requests ``url`` through that proxy with the login cookie attached, and
    writes the proxy to ``outFile`` when the response body is at least 1000
    bytes long. A shorter body is reported as a captcha page / blocked IP.

    Any error for a single proxy (malformed line, timeout, refused
    connection, ...) is printed and the worker moves on to the next line,
    so one bad entry never kills the thread.
    """
    cookie = "PHPSESSID=5f7mbqghvk1kt5n9illa0nr175; kmsign=56023b6880039; KMUID=ezsEg1YCOzxg97EwAwUXAg=="

    while True:
        with lock:
            raw = inFile.readline()
        if not raw:
            # readline() returns '' only at end of file: nothing left to do.
            return
        line = raw.strip()
        if not line:
            # Blank line in the middle of the file: skip it, keep going.
            continue

        try:
            protocol, proxy = line.split('=')
            # NOTE(review): a ProxyHandler entry is keyed by URL scheme, and
            # `url` is an http:// address, so an 'https' entry is probably
            # not exercised by this request - confirm before trusting
            # HTTPS results.
            proxy_support = urllib2.ProxyHandler(
                {protocol.lower(): '://'.join((protocol, proxy))})
            opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
            request = urllib2.Request(url)
            request.add_header("cookie", cookie)
            # Use this thread's own opener. install_opener() replaces a
            # process-wide opener, so concurrent threads would overwrite each
            # other's proxy and the request could go out through a different
            # proxy than the one being tested.
            content = opener.open(request, timeout=4).read()
            if len(content) >= 1000:
                with lock:
                    print('add proxy %s' % proxy)
                    outFile.write('"%s",\n' % proxy)
            else:
                print('出现验证码或IP被封杀')
        except Exception as error:
            # Best-effort check: report the failure and try the next proxy.
            print(error)

# Create the 500 validation workers, launch them all, then wait for every
# one of them to finish before the shared files are closed.
all_thread = [threading.Thread(target=test) for _ in range(500)]

for worker in all_thread:
    worker.start()

for worker in all_thread:
    worker.join()

# Every worker is done, so nothing touches the files any more.
inFile.close()
outFile.close()