python 如何抓取動態頁面內容？

輸入url，得到html，我早就寫了函數了

自己搜：

getUrlRespHtml

就可以找到對應的python函數：

#------------------------------------------------------------------------------

def getUrlResponse(url, postDict=None, headerDict=None, timeout=0, useGzip=False, postDataDelimiter="&"):
    """Open ``url`` with urllib2 and return the response object.

    Args:
        url: Target URL; coerced with ``str()`` before the request is built.
        postDict: Optional mapping of form fields. When non-empty the request
            is sent as a POST instead of the default GET.
        headerDict: Optional mapping of extra request headers. They are added
            after the default headers, so they can overwrite them.
        timeout: Passed to ``urllib2.urlopen`` only when greater than 0.
        useGzip: When true, send ``Accept-Encoding: gzip, deflate``.
        postDataDelimiter: Separator between ``key=value`` pairs of the POST
            body. ``"&"`` uses ``urllib.urlencode``; any other value
            concatenates ``str(key)=str(value)`` followed by the delimiter,
            without URL-encoding.

    Returns:
        The response object returned by ``urllib2.urlopen``.

    Note:
        If you want cookies to be handled automatically, call
        initAutoHandleCookies() before using this function; the following
        urllib2.Request will then handle cookies automatically.
        This function reads the module-level ``gConst['UserAgent']``,
        ``gVal['cookieUseFile']`` and ``gVal['cj']``.
    """
    # make sure url is a str, not unicode, otherwise urllib2.urlopen will error
    url = str(url)

    if postDict:
        if postDataDelimiter == "&":
            postData = urllib.urlencode(postDict)
        else:
            # every pair is followed by the delimiter (including the last one)
            postData = "".join(
                str(eachKey) + "=" + str(eachValue) + postDataDelimiter
                for eachKey, eachValue in postDict.items()
            )
        # strip() only removes surrounding whitespace (e.g. a trailing "\r\n"
        # delimiter); a non-whitespace trailing delimiter is kept
        postData = postData.strip()
        logging.info("postData=%s", postData)
        req = urllib2.Request(url, postData)
        logging.info("req=%s", req)
        req.add_header('Content-Type', "application/x-www-form-urlencoded")
    else:
        req = urllib2.Request(url)

    defHeaderDict = {
        'User-Agent': gConst['UserAgent'],
        'Cache-Control': 'no-cache',
        'Accept': '*/*',
        'Connection': 'Keep-Alive',
    }

    # add default headers first
    for eachDefHd, eachDefVal in defHeaderDict.items():
        req.add_header(eachDefHd, eachDefVal)

    if useGzip:
        req.add_header('Accept-Encoding', 'gzip, deflate')

    # add customized headers later -> allows overwriting the default headers
    if headerDict:
        for key, value in headerDict.items():
            req.add_header(key, value)

    if timeout > 0:
        # set timeout value only if necessary
        resp = urllib2.urlopen(req, timeout=timeout)
    else:
        resp = urllib2.urlopen(req)

    # update cookies into local file
    if gVal['cookieUseFile']:
        gVal['cj'].save()
        logging.info("gVal['cj']=%s", gVal['cj'])

    return resp

#------------------------------------------------------------------------------

# get response html==body from url

#def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :

def getUrlRespHtml(url, postDict=None, headerDict=None, timeout=0, useGzip=True, postDataDelimiter="&"):
    """Fetch ``url`` via getUrlResponse and return the (decompressed) body.

    Args:
        url, postDict, headerDict, timeout, useGzip, postDataDelimiter:
            Forwarded unchanged to ``getUrlResponse``.

    Returns:
        The response body as returned by ``resp.read()``, decompressed when
        the response declares ``Content-Encoding: gzip`` or ``deflate``.

    Raises:
        zlib.error: If the body is declared compressed but cannot be decoded.
    """
    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip, postDataDelimiter)
    try:
        respHtml = resp.read()
        respInfo = resp.info()
    finally:
        # release the underlying connection once the body has been read
        resp.close()

    # Even if "Accept-Encoding: gzip, deflate" was not sent, the server may
    # still answer with gzip or deflate; and sometimes the request asks for
    # gzip/deflate but the returned html is not compressed (the response info
    # then has no "Content-Encoding" header,
    # eg: /s/comment_730793bf010144j7_3.html).
    # -> so decide by the response header only, not by useGzip.
    if "Content-Encoding" in respInfo:
        # header values are compared case-insensitively, ignoring padding
        contentEncoding = respInfo['Content-Encoding'].strip().lower()
        if contentEncoding == "gzip":
            # 16 + MAX_WBITS: expect a gzip header and trailer
            respHtml = zlib.decompress(respHtml, 16 + zlib.MAX_WBITS)
        elif contentEncoding == "deflate":
            try:
                # negative wbits: raw deflate stream without zlib header
                respHtml = zlib.decompress(respHtml, -zlib.MAX_WBITS)
            except zlib.error:
                # some servers send the zlib-wrapped form instead
                respHtml = zlib.decompress(respHtml)

    return respHtml

及示例代碼：

# NOTE: placeholder — set this to the URL of the page to fetch before running
url = ""

respHtml = getUrlRespHtml(url)

完全庫函數，自己搜：

crifanLib.py

關於抓取動態頁面，詳見：

Python專題教程：抓取網站，模擬登陸，抓取動態網頁

（自己搜標題即可找到）

上一篇:微信大家來找茬遊戲蜂窩輔助教程

下一篇:如何理解java中實例化接口？

ecshop商品圖片不清晰怎麽辦？圖片尺寸都是正確的！

Flash CS5動畫制作完全自學手冊的目錄

開發一個微信小程序分銷系統要多少錢

基金下跌了一輪，有人說可以進場抄底了，有沒有一定賺錢的方法？

《天天鎖屏》賺錢技巧規則說明介紹

linux的jdk安裝linux版本jdk安裝

無線模塊的低功耗無線模塊