輸入url、得到html的函數,我早就寫好了
自己搜:
getUrlRespHtml
就可以找到對應的python函數:
#------------------------------------------------------------------------------
def getUrlResponse(url, postDict=None, headerDict=None, timeout=0, useGzip=False, postDataDelimiter="&"):
    """Open ``url`` and return the response object from ``urllib2.urlopen``.

    Args:
        url: Address to request. It is converted with ``str()`` first because
            ``urllib2.urlopen`` can fail when handed a unicode object.
        postDict: Optional mapping of form fields. When it is non-empty the
            request is built with a body, so it is sent as POST instead of
            the default GET.
        headerDict: Optional mapping of extra request headers. They are added
            after the default headers, so they override a default header of
            the same name.
        timeout: Timeout passed to ``urllib2.urlopen``. When it is not
            greater than 0, no explicit timeout is passed.
        useGzip: When true, send ``Accept-Encoding: gzip, deflate``. The
            response body is NOT decompressed by this function.
        postDataDelimiter: Separator between the POST fields. With the
            default ``"&"`` the body is produced by ``urllib.urlencode``;
            with any other value the fields are written as plain
            ``key=value`` pairs (no URL escaping) joined by the delimiter.

    Returns:
        The response object returned by ``urllib2.urlopen``.

    Note:
        To have cookies handled automatically, call initAutoHandleCookies()
        before using this function; the ``urllib2`` requests made here will
        then handle cookies.
    """
    # make sure url is a byte string, not unicode, otherwise urllib2.urlopen will error
    url = str(url)

    if postDict:
        if postDataDelimiter == "&":
            postData = urllib.urlencode(postDict)
        else:
            # join() puts the delimiter only BETWEEN the pairs; the previous
            # "+=" loop left a dangling delimiter at the end of the body
            postData = postDataDelimiter.join(
                str(eachKey) + "=" + str(postDict[eachKey]) for eachKey in postDict
            )
        postData = postData.strip()
        logging.info("postData=%s", postData)
        req = urllib2.Request(url, postData)
        logging.info("req=%s", req)
        req.add_header('Content-Type', "application/x-www-form-urlencoded")
    else:
        req = urllib2.Request(url)

    # gConst is a module-level dict defined elsewhere in the library
    defHeaderDict = {
        'User-Agent': gConst['UserAgent'],
        'Cache-Control': 'no-cache',
        'Accept': '*/*',
        'Connection': 'Keep-Alive',
    }

    # add default headers first
    for eachDefHd, eachDefVal in defHeaderDict.items():
        req.add_header(eachDefHd, eachDefVal)

    if useGzip:
        req.add_header('Accept-Encoding', 'gzip, deflate')

    # add customized headers later -> allows overwriting a default header
    if headerDict:
        for key, value in headerDict.items():
            req.add_header(key, value)

    if timeout > 0:
        # set timeout value only if necessary
        resp = urllib2.urlopen(req, timeout=timeout)
    else:
        resp = urllib2.urlopen(req)

    # update cookies into local file
    # NOTE(review): gVal['cj'] is presumably the cookie jar set up by
    # initAutoHandleCookies() -- confirm against that function
    if gVal['cookieUseFile']:
        gVal['cj'].save()
        logging.info("gVal['cj']=%s", gVal['cj'])

    return resp
#------------------------------------------------------------------------------
# get response html (== body) from url
#def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :
def _decompressRespBody(respBody, contentEncoding):
    """Return ``respBody`` decoded according to a Content-Encoding header value."""
    encoding = (contentEncoding or "").strip().lower()
    if encoding == "gzip":
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer
        return zlib.decompress(respBody, 16 + zlib.MAX_WBITS)
    if encoding == "deflate":
        try:
            # negative wbits -> raw deflate stream without any header
            return zlib.decompress(respBody, -zlib.MAX_WBITS)
        except zlib.error:
            # some servers send the zlib-wrapped form of "deflate" instead
            return zlib.decompress(respBody)
    # no (or unknown) content encoding -> body is returned untouched
    return respBody


def getUrlRespHtml(url, postDict=None, headerDict=None, timeout=0, useGzip=True, postDataDelimiter="&"):
    """Fetch ``url`` and return the response body, decompressed if needed.

    All arguments are forwarded unchanged to getUrlResponse(); see that
    function for their meaning. Note that ``useGzip`` defaults to True here.

    Returns:
        The response body as returned by ``resp.read()``. If the response
        carries a ``Content-Encoding`` header of ``gzip`` or ``deflate``
        (compared case-insensitively) the body is decompressed first.

    Raises:
        zlib.error: if the body cannot be decompressed as the encoding named
            in the response header.
    """
    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip, postDataDelimiter)
    try:
        respHtml = resp.read()
        respInfo = resp.info()
    finally:
        # always release the underlying connection, even if read() fails
        resp.close()

    # The decision is based on the RESPONSE header, not on useGzip:
    # - a server may answer gzip/deflate even though the request did not send
    #   "Accept-Encoding: gzip, deflate"
    # - a server may answer plain html even though gzip was requested; then
    #   the response has no "Content-Encoding: gzip" header
    #   eg: /s/comment_730793bf010144j7_3.html
    # -> so only decompress when the data is indeed compressed
    return _decompressRespBody(respHtml, respInfo.get("Content-Encoding", ""))
及示例代碼:
# Example usage: fetch the html (response body) of a page.
url = ""  # NOTE: left blank in the original post; put the page address here
respHtml = getUrlRespHtml(url)
完整的庫函數,自己搜:
crifanLib.py
關於抓取動態頁面,詳見:
Python專題教程:抓取網站,模擬登陸,抓取動態網頁
(自己搜標題即可找到)