如何用python爬取豆瓣讀書的數據

這兩天爬了豆瓣讀書的十萬條左右的書目信息，用時將近一天，現在趁著這個空閑把代碼總結一下，還是菜鳥，都是用的最簡單最笨的方法，還請路過的大神不吝賜教。

第一步，先看一下我們需要的庫：

import requests  #用來請求網頁

from bs4 import BeautifulSoup #解析網頁

import time  #設置延時時間，防止爬取過於頻繁被封IP號

import re #正則表達式庫

import pymysql  #由於爬取的數據太多，我們要把他存入MySQL數據庫中，這個庫用於連接數據庫

import random #這個庫裏用到了產生隨機數的randint函數，和上面的time搭配，使爬取間隔時間隨機

這個是豆瓣的網址：https://book.douban.com/tag/?view=type&icn=index-sorttags-all

我們要從這裏獲取所有分類的標簽鏈接，進一步去爬取裏面的信息，代碼先貼上來：

import requests
from bs4 import BeautifulSoup  # HTML parsing

# Douban Books tag index page.
# NOTE(review): the URL in the published post was truncated by the page
# extraction ("httom/tag/?icn=index-nav"); it is reconstructed here --
# confirm it against the live site before running.
url = "https://book.douban.com/tag/?icn=index-nav"

wb_data = requests.get(url)  # fetch the page
soup = BeautifulSoup(wb_data.text, "lxml")  # parse the HTML

# CSS path of the tag links (browser: right click -> Inspect -> Copy selector).
# select() returns a list of the matching <a> elements.
tags = soup.select("#content > div > div.article > div > div > table > tbody > tr > td > a")

# Every tag detail page is this prefix followed by the tag text, so the
# prefix is built once instead of on every iteration.
# NOTE(review): prefix also reconstructed from a truncated literal ("hom/tag/").
helf = "https://book.douban.com/tag/"

for tag in tags:
    tag = tag.get_text()  # text of one tag link
    url = helf + str(tag)  # assemble the tag detail-page URL
    print(url)

以上我們便爬取了所有標簽下的網址，我們將這個文件命名為channel,並在channel中創建一個channel字符串，放上我們所有爬取的網址信息，等下爬取詳情頁的時候直接從這裏提取鏈接就好了，如下：

# Tag page URLs produced by the first script, separated by whitespace so that
# channel.split() yields one URL per entry.
# NOTE(review): the published post lost the scheme and host of every entry
# (only "tag/程序" survived); the full URL is reconstructed here and the rest
# of the list must be regenerated by running the first script.
channel = '''
https://book.douban.com/tag/程序
'''

現在，我們開始第二個程序。

QQ圖片20160915233329.png

標簽頁下每一個圖片的信息基本都是這樣的，我們可以直接從這裏提取到標題，作者，出版社，出版時間，價格，評價人數，以及評分等信息（有些外國作品還會有譯者信息），提取方法與提取標簽類似，也是根據CSS路徑提取。

我們先用一個網址來實驗爬取：

# Trial run against a single tag page.
# Relies on requests / BeautifulSoup being imported and on ceshi_person,
# ceshi_priceone and ceshi_pricetwo being defined before this code runs.
# NOTE(review): the URL literal was truncated in the published post
# ("htt/tag/科技"); reconstructed here -- confirm before running.
url = "https://book.douban.com/tag/科技"

wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.text.encode("utf-8"), "lxml")

# Tag name taken from the last path segment of the URL, kept for storage.
tag = url.split("?")[0].split("/")[-1]

# Walk the listing one book at a time. Selecting four parallel lists and
# zipping them misaligns the data as soon as one book lacks a rating span,
# so every field is looked up inside its own <li>.
for item in soup.select("#subject_list > ul > li"):
    detil = item.select_one("div.info > div.pub")  # "author / ... / price" line
    person_node = item.select_one("div.info > div.star.clearfix > span.pl")  # rating count
    title_node = item.select_one("div.info > h2 > a")  # book title link
    if detil is None or person_node is None or title_node is None:
        continue

    # Some books have no rating; store an empty string for them.
    scor = item.select_one("div.info > div.star.clearfix > span.rating_nums")
    scoe = scor.get_text() if scor is not None else ""

    pub_text = detil.get_text()
    try:
        try:
            # Layout with a translator: author / translator / publisher / date / price
            parts = pub_text.split("/", 4)
            author = parts[0].split()[0]
            yizhe = parts[1]
            publish = parts[2]
            year = parts[3].split()[0].split("-")[0]  # keep only the publication year
            price = ceshi_priceone(detil)  # normalise the price to yuan
        except IndexError:
            # No translator: author / publisher / date / price
            parts = pub_text.split("/", 3)
            author = parts[0].split()[0]
            yizhe = ""
            publish = parts[1]
            year = parts[2].split()[0].split("-")[0]
            price = ceshi_pricetwo(detil)
        person = ceshi_person(person_node)  # "fewer than 10" style counts become 10
        title = title_node.get_text().split()[0]
    except (IndexError, TypeError, ValueError):
        # A few entries lack publisher/date or carry an unparsable price;
        # they are rare, so they are skipped rather than aborting the crawl.
        continue

#提取評價人數的函數，如果評價人數少於十人，按十人處理

def ceshi_person(person):
    """Return the number of raters shown in a rating-count element.

    ``person`` is an element whose ``get_text()`` yields something like
    "(1234人评价)". The first whitespace-separated token is taken, its leading
    character and last four characters are dropped, and the remainder is
    parsed as an integer. When that remainder is not a plain number (for
    example the "fewer than 10 ratings" wording) the count is reported as 10.

    Raises IndexError when the element text is empty.
    """
    token = person.get_text().split()[0]
    try:
        return int(token[1:len(token) - 4])
    except ValueError:
        return 10

#分情況提取價格的函數，用正則表達式找到含有特殊字符的信息，並換算為“元”

def ceshi_priceone(price):
    """Extract the price from a five-field publication line.

    ``price`` is the publication-info element; its text is expected to look
    like "author / translator / publisher / date / price". The element passed
    in is used directly (the earlier version ignored its argument and read a
    global named ``detil``, which fails when called from inside a function).

    Returns a float (amount multiplied by 6) for prices marked "USD" or
    prefixed with "$", and the raw price string otherwise ("CNY 25.00" gives
    "25.00"; anything else returns its first token unchanged).

    Raises IndexError when the text has fewer than five "/"-separated fields
    or the price field is empty; callers use this to detect the layout
    without a translator.
    """
    tokens = price.get_text().split("/", 4)[4].split()
    head = tokens[0]
    if head.startswith("USD"):
        return float(tokens[1]) * 6
    if head.startswith("CNY"):
        return tokens[1]
    if head.startswith("$"):
        # "$12.5": the amount follows the dollar sign inside the same token.
        return float(head[1:]) * 6
    return head

def ceshi_pricetwo(price):
    """Extract the price from a four-field publication line.

    ``price`` is the publication-info element; its text is expected to look
    like "author / publisher / date / price" (no translator). The element
    passed in is used directly (the earlier version ignored its argument and
    read a global named ``detil``, which fails when called from inside a
    function).

    Returns a float (amount multiplied by 6) for prices marked "USD" or
    prefixed with "$", and the raw price string otherwise ("CNY 25.00" gives
    "25.00"; anything else returns its first token unchanged).

    Raises IndexError when the text has fewer than four "/"-separated fields
    or the price field is empty.
    """
    tokens = price.get_text().split("/", 3)[3].split()
    head = tokens[0]
    if head.startswith("USD"):
        return float(tokens[1]) * 6
    if head.startswith("CNY"):
        return tokens[1]
    if head.startswith("$"):
        # "$12.5": the amount follows the dollar sign inside the same token.
        return float(head[1:]) * 6
    return head

實驗成功後，我們就可以爬取數據並導入到數據庫中了，以下為全部源碼，特殊情況會用註釋一一說明。

import requests

from bs4 import BeautifulSoup

import time

import re

import pymysql

from channel import channel  #這是我們第一個程序爬取的鏈接信息

import random

def ceshi_person(person):
    """Return the number of raters shown in a rating-count element.

    ``person`` is an element whose ``get_text()`` yields something like
    "(1234人评价)". The first whitespace-separated token is taken, its leading
    character and last four characters are dropped, and the remainder is
    parsed as an integer. When that remainder is not a plain number (for
    example the "fewer than 10 ratings" wording) the count is reported as 10.

    Raises IndexError when the element text is empty.
    """
    token = person.get_text().split()[0]
    try:
        return int(token[1:len(token) - 4])
    except ValueError:
        return 10

def ceshi_priceone(price):
    """Extract the price from a five-field publication line.

    ``price`` is the publication-info element; its text is expected to look
    like "author / translator / publisher / date / price". The element passed
    in is used directly (the earlier version ignored its argument and read a
    global named ``detil``, which does not exist at module level in this
    program and raised NameError).

    Returns a float (amount multiplied by 6) for prices marked "USD" or
    prefixed with "$", and the raw price string otherwise ("CNY 25.00" gives
    "25.00"; anything else returns its first token unchanged).

    Raises IndexError when the text has fewer than five "/"-separated fields
    or the price field is empty; callers use this to detect the layout
    without a translator.
    """
    tokens = price.get_text().split("/", 4)[4].split()
    head = tokens[0]
    if head.startswith("USD"):
        return float(tokens[1]) * 6
    if head.startswith("CNY"):
        return tokens[1]
    if head.startswith("$"):
        # "$12.5": the amount follows the dollar sign inside the same token.
        return float(head[1:]) * 6
    return head

def ceshi_pricetwo(price):
    """Extract the price from a four-field publication line.

    ``price`` is the publication-info element; its text is expected to look
    like "author / publisher / date / price" (no translator). The element
    passed in is used directly (the earlier version ignored its argument and
    read a global named ``detil``, which does not exist at module level in
    this program and raised NameError).

    Returns a float (amount multiplied by 6) for prices marked "USD" or
    prefixed with "$", and the raw price string otherwise ("CNY 25.00" gives
    "25.00"; anything else returns its first token unchanged).

    Raises IndexError when the text has fewer than four "/"-separated fields
    or the price field is empty.
    """
    tokens = price.get_text().split("/", 3)[3].split()
    head = tokens[0]
    if head.startswith("USD"):
        return float(tokens[1]) * 6
    if head.startswith("CNY"):
        return tokens[1]
    if head.startswith("$"):
        # "$12.5": the amount follows the dollar sign inside the same token.
        return float(head[1:]) * 6
    return head

#這是上面的那個測試函數，我們把它放在主函數中

def mains(url):
    """Scrape one tag listing page and insert its books into ``allbooks``.

    ``url`` is a listing-page URL; the tag name stored with each record is
    the last path segment of the URL with any query string removed.

    Uses the module-level ``cur`` and ``conn`` database handles, which must
    exist before this function is called. Books whose publication line
    cannot be parsed are skipped.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text.encode("utf-8"), "lxml")
    tag = url.split("?")[0].split("/")[-1]

    rows = []  # one list of column values per successfully parsed book
    # Walk the listing one book at a time. Selecting four parallel lists and
    # zipping them misaligns the data as soon as one book lacks a rating span,
    # so every field is looked up inside its own <li>.
    for item in soup.select("#subject_list > ul > li"):
        detil = item.select_one("div.info > div.pub")
        person_node = item.select_one("div.info > div.star.clearfix > span.pl")
        title_node = item.select_one("div.info > h2 > a")
        if detil is None or person_node is None or title_node is None:
            continue

        # Some books have no rating; store an empty string for them.
        scor = item.select_one("div.info > div.star.clearfix > span.rating_nums")
        scoe = scor.get_text() if scor is not None else ""

        pub_text = detil.get_text()
        try:
            try:
                # Layout with a translator:
                # author / translator / publisher / date / price
                parts = pub_text.split("/", 4)
                author = parts[0].split()[0]
                yizhe = parts[1]
                publish = parts[2]
                # Only the year is kept. Named "year" so the local does not
                # shadow the imported time module.
                year = parts[3].split()[0].split("-")[0]
                price = ceshi_priceone(detil)
            except IndexError:
                # No translator: author / publisher / date / price
                parts = pub_text.split("/", 3)
                author = parts[0].split()[0]
                yizhe = ""
                publish = parts[1]
                year = parts[2].split()[0].split("-")[0]
                price = ceshi_pricetwo(detil)
            person = ceshi_person(person_node)
            # Only the first whitespace-separated part of the title is kept.
            title = title_node.get_text().split()[0]
        except (IndexError, TypeError, ValueError):
            # A few entries lack publisher/date or carry an unparsable price;
            # they are rare, so they are skipped rather than aborting the crawl.
            continue

        # Column order matches the allbooks table definition.
        rows.append([title, scoe, author, price, year, publish, person, yizhe, tag])

    if rows:
        sql = "INSERT INTO allbooks values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # Insert the whole page in one batch and commit once per page.
        cur.executemany(sql, rows)
        conn.commit()

#主函數到此結束

# 將Python連接到MySQL中的python數據庫中

# Connect to the MySQL database named "python".
# NOTE(review): credentials are hard-coded; move them to configuration or
# environment variables before sharing or deploying this script.
conn = pymysql.connect(user="root", password="123123", database="python", charset='utf8')
cur = conn.cursor()

try:
    # Start from a clean table on every run.
    cur.execute('DROP TABLE IF EXISTS allbooks')
    sql = """CREATE TABLE allbooks(
    title CHAR(255) NOT NULL,
    scor CHAR(255),
    author CHAR(255),
    price CHAR(255),
    time CHAR(255),
    publish CHAR(255),
    person CHAR(255),
    yizhe CHAR(255),
    tag CHAR(255)
    )"""
    cur.execute(sql)

    # time.clock() was removed in Python 3.8; perf_counter() measures the
    # elapsed wall-clock time of the crawl.
    start = time.perf_counter()

    for urls in channel.split():
        # Build the URL of every listing page of this tag (20 books per page).
        urlss = [urls + "?start={}&type=T".format(str(i)) for i in range(0, 980, 20)]
        for url in urlss:
            mains(url)  # scrape the page and store its books
            print(url)  # progress output; also shows where a failure happened
            # Random pause of 0-9 seconds between pages to avoid an IP ban.
            time.sleep(random.randint(0, 9))

    end = time.perf_counter()
    print('Time Usage:', end - start)

    # execute() returns the number of selected rows.
    count = cur.execute('select * from allbooks')
    print('has %s record' % count)
finally:
    # Release the database resources even when the crawl fails part-way.
    cur.close()
    conn.close()

這樣，一個程序就算完成了，豆瓣的書目信息就一條條地寫進了我們的數據庫中，當然，在爬取的過程中，也遇到了很多問題，比如標題返回的信息拆分後中會有空格，寫入數據庫中會出現錯誤，所以只截取了標題的第一部分，因而導致數據庫中的一些書名不完整，過往的大神如果有什麽辦法，還請指教一二。

等待爬取的過程是漫長而又欣喜的，看著電腦上一條條信息被刷出來，成就感就不知不覺湧上心頭；然而如果你吃飯時它在爬，你上廁所時它在爬，你都已經爬了個山回來了它還在爬時，便會有點崩潰了，擔心電腦隨時都會壞掉（還是窮學生換不起啊啊啊啊~）

所以，還是要好好學學設置斷點，多線程，以及正則，路漫漫其修遠兮，吾將上下而求索~與君共勉~

上一篇:編碼的字符集編

下一篇:黃石有什麽好的職高技術學院？

大學生科研項目申請理由

無人機創業計劃書

完美勝任1080P遊戲，RTX 3050 Ti筆記本電腦GPU性能測試

好代碼的味道：戰略篇

遊戲Ui跟其他UI設計什麽區別？學的內容是壹樣的還是有很大區別呢？

支付寶可以轉賬給市民卡嗎？

面試電工的題目啊，來幫幫忙，PLC主要由哪些電路組成?他程序是怎樣執行型的(工作原理)