分类
2015-04-09 13:34作者:zhao
其实这里casperjs与python没有直接关系,主要依赖casperjs调用phantomjs webkit获取html文件内容。长期以来,爬虫抓取 客户端javascript渲染生成的html页面 都极为 困难, Java里面有 HtmlUnit, 而Python里,我们可以使用独立的跨平台的CasperJS。
创建site.js(接口文件,输入:url,输出:html file)
// USAGE: casperjs site.js --url=http://spys.ru/free-proxy-list/IE/ --outputfile='temp.html'
// Fetch the fully JavaScript-rendered HTML of `url` (via the PhantomJS
// webkit engine driven by CasperJS) and write it to `outputfile`.
var fs = require('fs');
var casper = require('casper').create({
    pageSettings: {
        loadImages: false,   // skip images: only the DOM is needed
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER'
    },
    logLevel: "debug",  // log level
    verbose: true       // echo log records to the console
});
// `raw.get` keeps the CLI values unparsed (no type coercion).
var url = casper.cli.raw.get('url');
var outputfile = casper.cli.raw.get('outputfile');
// Request the page; once loaded, dump its rendered HTML to the output file.
casper.start(url, function () {
    fs.write(outputfile, this.getHTML(), 'w');
});
casper.run();
python 代码, checkout_proxy.py
import json
import logging
import os
import os.path
import platform
import random
import re
import shutil
import subprocess
import sys
import threading
import traceback
from datetime import datetime
# import requests
# import requests.utils, pickle
# from multiprocessing import Process, Manager

from bs4 import BeautifulSoup
# Directory this script lives in; all artifacts are kept alongside it.
_BASE_DIR = os.path.dirname(os.path.realpath(__file__))

# Collected proxies are persisted to this file, one "ip:port" per line.
output_file = os.path.join(_BASE_DIR, 'proxy.txt')

# One log file per day under ./logs/ (created on first run).
global_log = 'http_proxy' + datetime.now().strftime('%Y-%m-%d') + '.log'
if not os.path.exists(os.path.join(_BASE_DIR, 'logs')):
    os.mkdir(os.path.join(_BASE_DIR, 'logs'))
global_log = os.path.join(_BASE_DIR, 'logs', global_log)

logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s][%(levelname)s][%(module)s][%(funcName)s][%(lineno)d]%(message)s',
    filename=global_log,
    filemode='a',
)
log = logging.getLogger(__name__)

# manager = Manager()
# PROXY_LIST = manager.list()
mutex = threading.Lock()  # guards PROXY_LIST across scraper threads
PROXY_LIST = []
def isWindows():
    """Return True when running on a Windows host, else False.

    `platform.uname()` stringifies with the system name first, so a
    simple substring test is sufficient.
    """
    return "Windows" in str(platform.uname())
def getTagsByAttrs(tagName, pageContent, attrName, attrRegValue):
    """Return every `tagName` tag in the HTML string `pageContent` whose
    `attrName` attribute matches the regular expression `attrRegValue`.
    """
    soup = BeautifulSoup(pageContent)
    return soup.find_all(tagName, {attrName: re.compile(attrRegValue)})
def getTagsByAttrsExt(tagName, filename, attrName, attrRegValue):
    """Like getTagsByAttrs, but parses the HTML file at `filename`.

    Returns None when `filename` does not exist or is not a regular file.
    """
    if not os.path.isfile(filename):
        return None
    # `with` guarantees the handle is closed even if parsing raises.
    with open(filename, 'r') as f:
        soup = BeautifulSoup(f)
    return soup.find_all(tagName, {attrName: re.compile(attrRegValue)})
class Site1Thread(threading.Thread):
    """Scrape http://spys.ru/free-proxy-list/IE/ through CasperJS and
    append every "ip:port" string found to the shared PROXY_LIST.

    `outputFilePath` is the directory containing the casperjs executable.
    """

    def __init__(self, outputFilePath):
        threading.Thread.__init__(self)
        self.outputFilePath = outputFilePath
        # Random dump name so concurrent runs do not clobber each other.
        self.fileName = str(random.randint(100, 1000)) + ".html"
        self.setName('Site1Thread')

    def run(self):
        # Make sure site.js sits next to the casperjs executable.
        script_dir = os.path.dirname(os.path.realpath(__file__))
        site1_file = os.path.join(script_dir, 'site.js')
        site2_file = os.path.join(self.outputFilePath, 'site.js')
        if not os.path.isfile(site2_file) and os.path.isfile(site1_file):
            shutil.copy(site1_file, site2_file)

        cmd = ("casperjs site.js --url=http://spys.ru/free-proxy-list/IE/ "
               "--outputfile=%s" % self.fileName)
        if isWindows():
            proc = subprocess.Popen(
                ["cmd", "/c", "%s/%s" % (self.outputFilePath, cmd)],
                stdout=subprocess.PIPE)
        else:
            proc = subprocess.Popen(
                ["bash", "-c", "cd %s && ./%s" % (self.outputFilePath, cmd)],
                stdout=subprocess.PIPE)
        out = proc.communicate()[0]

        # The dump location varies per platform, so probe every candidate.
        htmlFileName = ''
        if os.path.isfile(self.fileName):
            htmlFileName = self.fileName
        elif os.path.isfile(os.path.join(self.outputFilePath, self.fileName)):
            htmlFileName = os.path.join(self.outputFilePath, self.fileName)
        elif os.path.isfile(os.path.join(script_dir, self.fileName)):
            htmlFileName = os.path.join(script_dir, self.fileName)
        if not os.path.isfile(htmlFileName):
            print('Failed to get html content from http://spys.ru/free-proxy-list/IE/')
            print(out)
            # NOTE: SystemExit raised here only terminates this thread,
            # not the whole process.
            sys.exit(3)

        mutex.acquire()
        try:
            PROXYList = getTagsByAttrsExt('font', htmlFileName, 'class', 'spy14$')
            for proxy in PROXYList:
                tdContent = proxy.renderContents()
                lineElems = re.split('[<>]', tdContent)
                # Keep only rows that look like "<dotted-quad ip> ... <port>":
                # digits at the tail, a dotted address at the head.
                if (re.search(r'\d+', lineElems[-1])
                        and re.search(r'(\d+\.\d+\.\d+)', lineElems[0])):
                    print(lineElems[0] + " " + lineElems[-1])
                    PROXY_LIST.append("%s:%s" % (lineElems[0], lineElems[-1]))
        finally:
            # Release even if parsing raises, so other threads don't deadlock.
            mutex.release()

        # Best-effort cleanup of the temporary HTML dump.
        try:
            if os.path.isfile(htmlFileName):
                os.remove(htmlFileName)
        except OSError:
            pass
if __name__ == '__main__':
    try:
        # argv[1] must be the directory holding the casperjs executable.
        if len(sys.argv) < 2:
            print("Usage: %s [casperjs path]" % (sys.argv[0]))
            sys.exit(1)
        if not os.path.exists(sys.argv[1]):
            print("casperjs path: %s does not exist!" % (sys.argv[1]))
            sys.exit(2)
        # Seed PROXY_LIST with previously collected proxies so the final
        # de-duplicated rewrite keeps them.
        if os.path.isfile(output_file):
            with open(output_file) as f:
                for line in f:
                    PROXY_LIST.append(line.strip())
        thread1 = Site1Thread(sys.argv[1])
        thread1.start()
        thread1.join()
        # Rewrite the whole file; `set` drops duplicate entries.
        with open(output_file, 'w') as f:
            for proxy in set(PROXY_LIST):
                f.write(proxy + "\n")
        print("Done!")
    except SystemExit:
        pass  # usage errors above already printed their message
    except Exception:
        errMsg = traceback.format_exc()
        print(errMsg)
        log.error(errMsg)
相关