當前位置:編程學習大全網 - 源碼下載 - python 如何抓取動態頁面內容?

python 如何抓取動態頁面內容?

輸入url,得到html,我早就寫了函數了

自己搜:

getUrlRespHtml

就可以找到對應的python函數:

#------------------------------------------------------------------------------

def getUrlResponse(url, postDict={}, headerDict={}, timeout=0, useGzip=False, postDataDelimiter="&"):
    """Open url with urllib2 and return the response object.

    The request is a plain GET unless postDict is non-empty; in that case the
    dict is serialized into the request body, which turns the request into a
    POST with Content-Type application/x-www-form-urlencoded.

    Args:
        url: target URL. It is converted with str() first, because
            urllib2.urlopen can fail when handed a unicode object.
        postDict: form fields to send. Only read here, never modified.
        headerDict: extra request headers. They are added after the default
            headers so that they can overwrite them. Only read, never modified.
        timeout: passed to urllib2.urlopen as its timeout when > 0; otherwise
            urlopen is called without an explicit timeout.
        useGzip: when true, sends "Accept-Encoding: gzip, deflate". The
            response body is NOT decompressed by this function.
        postDataDelimiter: separator between the key=value pairs of the POST
            body. "&" uses urllib.urlencode (with escaping); any other value
            joins the raw str(key) + "=" + str(value) pairs with no escaping.

    Returns:
        The response object returned by urllib2.urlopen.

    Note:
        To have cookies handled automatically, call initAutoHandleCookies()
        before using this function.
        This function reads the module-level gConst['UserAgent'],
        gVal['cookieUseFile'] and gVal['cj'].
    """
    # Make sure url is a str, not unicode, otherwise urllib2.urlopen may error.
    url = str(url)

    if postDict:
        if postDataDelimiter == "&":
            postData = urllib.urlencode(postDict)
        else:
            # Join the pairs instead of appending the delimiter after each one,
            # so that a non-whitespace delimiter (e.g. ";") is not left
            # dangling at the end of the body.
            postData = postDataDelimiter.join(
                str(key) + "=" + str(value) for key, value in postDict.items()
            )
        postData = postData.strip()
        logging.info("postData=%s", postData)
        req = urllib2.Request(url, postData)
        logging.info("req=%s", req)
        req.add_header('Content-Type', "application/x-www-form-urlencoded")
    else:
        req = urllib2.Request(url)

    defHeaderDict = {
        'User-Agent': gConst['UserAgent'],
        'Cache-Control': 'no-cache',
        'Accept': '*/*',
        'Connection': 'Keep-Alive',
    }

    # Add the default headers first ...
    for headerName, headerValue in defHeaderDict.items():
        req.add_header(headerName, headerValue)

    if useGzip:
        req.add_header('Accept-Encoding', 'gzip, deflate')

    # ... then the caller's headers, so they can overwrite the defaults.
    if headerDict:
        for headerName, headerValue in headerDict.items():
            req.add_header(headerName, headerValue)

    if timeout > 0:
        # Only pass a timeout when the caller asked for one.
        resp = urllib2.urlopen(req, timeout=timeout)
    else:
        resp = urllib2.urlopen(req)

    # Persist the (possibly updated) cookies to the local cookie file.
    if gVal['cookieUseFile']:
        gVal['cj'].save()
        logging.info("gVal['cj']=%s", gVal['cj'])

    return resp

#------------------------------------------------------------------------------

# get response html==body from url

#def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :

def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=True, postDataDelimiter="&"):
    """Fetch url and return the response body, decompressed when necessary.

    All arguments are forwarded unchanged to getUrlResponse(); see that
    function for their meaning. Note that useGzip defaults to True here.

    Returns:
        The response body as returned by resp.read(), inflated when the
        response carries a "Content-Encoding" header of gzip / x-gzip /
        deflate. Any other (or missing) Content-Encoding leaves the body
        untouched.

    Raises:
        zlib.error: if the body is announced as compressed but cannot be
            decompressed.
    """
    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip, postDataDelimiter)
    try:
        respHtml = resp.read()
        respInfo = resp.info()
    finally:
        # Release the underlying connection even if reading fails.
        resp.close()

    # Decide from the *response* header, not from useGzip:
    # - a server may answer compressed even if we did not ask for it;
    # - a server may answer uncompressed even though we sent
    #   "Accept-Encoding: gzip, deflate" (then there is no Content-Encoding
    #   header, e.g. /s/comment_730793bf010144j7_3.html).
    # So only decompress when the data is indeed announced as compressed.
    if "Content-Encoding" in respInfo:
        # Content-coding names are case-insensitive; tolerate stray spaces.
        contentEncoding = respInfo['Content-Encoding'].strip().lower()
        if contentEncoding in ("gzip", "x-gzip"):
            # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
            respHtml = zlib.decompress(respHtml, 16 + zlib.MAX_WBITS)
        elif contentEncoding == "deflate":
            try:
                # Many servers send a raw deflate stream (no zlib header).
                respHtml = zlib.decompress(respHtml, -zlib.MAX_WBITS)
            except zlib.error:
                # Others send the zlib-wrapped stream.
                respHtml = zlib.decompress(respHtml)

    return respHtml

及示例代碼:

# Example usage: set url to the page to fetch, then read its (decompressed) body.
url = ""

respHtml = getUrlRespHtml(url)

完全庫函數,自己搜:

crifanLib.py

關於抓取動態頁面,詳見:

Python專題教程:抓取網站,模擬登陸,抓取動態網頁

(自己搜標題即可找到)

  • 上一篇:微信大家來找茬遊戲蜂窩輔助教程
  • 下一篇:如何理解java中實例化接口?
  • copyright 2024編程學習大全網