分类
2015-04-09 13:34作者:zhao
其实这里casperjs与python没有直接关系,主要依赖casperjs调用phantomjs webkit获取html文件内容。长期以来,爬虫抓取 客户端javascript渲染生成的html页面 都极为 困难, Java里面有 HtmlUnit, 而Python里,我们可以使用独立的跨平台的CasperJS。
创建site.js(接口文件,输入:url,输出:html file)
// USAGE: casperjs site.js --url=http://spys.ru/free-proxy-list/IE/ --outputfile='temp.html'
// Fetch the fully JavaScript-rendered HTML of `url` (via the PhantomJS
// webkit engine driven by CasperJS) and write it to `outputfile`.
var fs = require('fs');
var casper = require('casper').create({
    pageSettings: {
        loadImages: false,   // skip images: only the DOM is needed
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER'
    },
    logLevel: "debug",  // log level
    verbose: true       // echo log records to the console
});
// `raw.get` keeps the CLI values unparsed (no type coercion).
var url = casper.cli.raw.get('url');
var outputfile = casper.cli.raw.get('outputfile');
// Request the page; once loaded, dump its rendered HTML to the output file.
casper.start(url, function () {
    fs.write(outputfile, this.getHTML(), 'w');
});
casper.run();
python 代码, checkout_proxy.py
import json
import logging
import os
import os.path
import platform
import random
import re
import shutil
import subprocess
import sys
import threading
import traceback
from datetime import datetime
# import requests
# import requests.utils, pickle
# from multiprocessing import Process, Manager

from bs4 import BeautifulSoup
# Directory this script lives in; all artifacts are kept alongside it.
_BASE_DIR = os.path.dirname(os.path.realpath(__file__))

# Collected proxies are persisted to this file, one "ip:port" per line.
output_file = os.path.join(_BASE_DIR, 'proxy.txt')

# One log file per day under ./logs/ (created on first run).
global_log = 'http_proxy' + datetime.now().strftime('%Y-%m-%d') + '.log'
if not os.path.exists(os.path.join(_BASE_DIR, 'logs')):
    os.mkdir(os.path.join(_BASE_DIR, 'logs'))
global_log = os.path.join(_BASE_DIR, 'logs', global_log)

logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s][%(levelname)s][%(module)s][%(funcName)s][%(lineno)d]%(message)s',
    filename=global_log,
    filemode='a',
)
log = logging.getLogger(__name__)

# manager = Manager()
# PROXY_LIST = manager.list()
mutex = threading.Lock()  # guards PROXY_LIST across scraper threads
PROXY_LIST = []
def isWindows():
    """Return True when running on a Windows host, else False.

    `platform.uname()` stringifies with the system name first, so a
    simple substring test is sufficient.
    """
    return "Windows" in str(platform.uname())
def getTagsByAttrs(tagName, pageContent, attrName, attrRegValue):
    """Return every `tagName` tag in the HTML string `pageContent` whose
    `attrName` attribute matches the regular expression `attrRegValue`.
    """
    soup = BeautifulSoup(pageContent)
    return soup.find_all(tagName, {attrName: re.compile(attrRegValue)})
def getTagsByAttrsExt(tagName, filename, attrName, attrRegValue):
    """Like getTagsByAttrs, but parses the HTML file at `filename`.

    Returns None when `filename` does not exist or is not a regular file.
    """
    if not os.path.isfile(filename):
        return None
    # `with` guarantees the handle is closed even if parsing raises.
    with open(filename, 'r') as f:
        soup = BeautifulSoup(f)
    return soup.find_all(tagName, {attrName: re.compile(attrRegValue)})
class Site1Thread(threading.Thread):
    """Scrape http://spys.ru/free-proxy-list/IE/ through CasperJS and
    append every "ip:port" string found to the shared PROXY_LIST.

    `outputFilePath` is the directory containing the casperjs executable.
    """

    def __init__(self, outputFilePath):
        threading.Thread.__init__(self)
        self.outputFilePath = outputFilePath
        # Random dump name so concurrent runs do not clobber each other.
        self.fileName = str(random.randint(100, 1000)) + ".html"
        self.setName('Site1Thread')

    def run(self):
        # Make sure site.js sits next to the casperjs executable.
        script_dir = os.path.dirname(os.path.realpath(__file__))
        site1_file = os.path.join(script_dir, 'site.js')
        site2_file = os.path.join(self.outputFilePath, 'site.js')
        if not os.path.isfile(site2_file) and os.path.isfile(site1_file):
            shutil.copy(site1_file, site2_file)

        cmd = ("casperjs site.js --url=http://spys.ru/free-proxy-list/IE/ "
               "--outputfile=%s" % self.fileName)
        if isWindows():
            proc = subprocess.Popen(
                ["cmd", "/c", "%s/%s" % (self.outputFilePath, cmd)],
                stdout=subprocess.PIPE)
        else:
            proc = subprocess.Popen(
                ["bash", "-c", "cd %s && ./%s" % (self.outputFilePath, cmd)],
                stdout=subprocess.PIPE)
        out = proc.communicate()[0]

        # The dump location varies per platform, so probe every candidate.
        htmlFileName = ''
        if os.path.isfile(self.fileName):
            htmlFileName = self.fileName
        elif os.path.isfile(os.path.join(self.outputFilePath, self.fileName)):
            htmlFileName = os.path.join(self.outputFilePath, self.fileName)
        elif os.path.isfile(os.path.join(script_dir, self.fileName)):
            htmlFileName = os.path.join(script_dir, self.fileName)
        if not os.path.isfile(htmlFileName):
            print('Failed to get html content from http://spys.ru/free-proxy-list/IE/')
            print(out)
            # NOTE: SystemExit raised here only terminates this thread,
            # not the whole process.
            sys.exit(3)

        mutex.acquire()
        try:
            PROXYList = getTagsByAttrsExt('font', htmlFileName, 'class', 'spy14$')
            for proxy in PROXYList:
                tdContent = proxy.renderContents()
                lineElems = re.split('[<>]', tdContent)
                # Keep only rows that look like "<dotted-quad ip> ... <port>":
                # digits at the tail, a dotted address at the head.
                if (re.search(r'\d+', lineElems[-1])
                        and re.search(r'(\d+\.\d+\.\d+)', lineElems[0])):
                    print(lineElems[0] + " " + lineElems[-1])
                    PROXY_LIST.append("%s:%s" % (lineElems[0], lineElems[-1]))
        finally:
            # Release even if parsing raises, so other threads don't deadlock.
            mutex.release()

        # Best-effort cleanup of the temporary HTML dump.
        try:
            if os.path.isfile(htmlFileName):
                os.remove(htmlFileName)
        except OSError:
            pass
if __name__ == '__main__':
    try:
        # argv[1] must be the directory holding the casperjs executable.
        if len(sys.argv) < 2:
            print("Usage: %s [casperjs path]" % (sys.argv[0]))
            sys.exit(1)
        if not os.path.exists(sys.argv[1]):
            print("casperjs path: %s does not exist!" % (sys.argv[1]))
            sys.exit(2)
        # Seed PROXY_LIST with previously collected proxies so the final
        # de-duplicated rewrite keeps them.
        if os.path.isfile(output_file):
            with open(output_file) as f:
                for line in f:
                    PROXY_LIST.append(line.strip())
        thread1 = Site1Thread(sys.argv[1])
        thread1.start()
        thread1.join()
        # Rewrite the whole file; `set` drops duplicate entries.
        with open(output_file, 'w') as f:
            for proxy in set(PROXY_LIST):
                f.write(proxy + "\n")
        print("Done!")
    except SystemExit:
        pass  # usage errors above already printed their message
    except Exception:
        errMsg = traceback.format_exc()
        print(errMsg)
        log.error(errMsg)
相关