前言
本文的文字及圖片來源于網絡,僅供學習、交流使用,不具有任何商業(yè)用途,版權歸原作者所有,如有問題請及時聯系我們以作處理。
作者: Star_Zhao
PS: 如有需要Python學習資料的小伙伴可以點擊下方鏈接自行獲取 http://t.cn/A6Zvjdun
本次爬取自如網房源信息所用到的知識點:
- requests get請求
- lxml解析html
- Xpath
- MongoDB存儲
正文
分析目標站點
- url: http://hz.ziroom.com/z/nl/z3.html?p=2 的p參數控制分頁
- get請求
獲取單頁源碼
# -*- coding: utf-8 -*-
import requests
import time
from requests.exceptions import RequestException


def get_one_page(page):
    """Fetch the HTML source of listing page *page* and print it.

    Prints the body on HTTP 200; returns None on a request failure.
    """
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0(windowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            print(res.text)
    except RequestException:
        return None


def main():
    page = 1
    get_one_page(page)


if __name__ == '__main__':
    main()
    time.sleep(1)
解析單頁源碼
- 解析html文檔, 目的: 測試XPath表達式
將獲取的源碼保存到當前文件夾下的"result.html"中, 然后通過XPath對其進行相應內容的提取, 當然你也可以使用某些在線工具.
from lxml import etree

# Parse the locally saved page -- goal: test the XPath expressions offline.
# Fixed filename typo: was "./resul.html", but the page was saved as "result.html".
html = etree.parse("./result.html", etree.HTMLParser())
results = html.xpath('//ul[@id="houseList"]/li')
for result in results[1:]:
    # Title text carries a 5-char prefix that is stripped when present.
    title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
    location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
    # Join the area fragments with " ", then drop the first (leading) space.
    area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
    nearby = result.xpath("./div/div/p[2]/span/text()")[0]
    print(title)
    print(location)
    print(area)
    print(nearby)
解析源代碼
from lxml import etree


def parse_one_page(sourcehtml):
    """Parse one page's HTML source and yield one dict per listing."""
    contentTree = etree.HTML(sourcehtml)          # parse the source code
    results = contentTree.xpath('//ul[@id="houseList"]/li')  # extract listing rows via XPath
    # Skip results[0] -- presumably a non-data header row; confirm against the page.
    for result in results[1:]:
        title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join the area fragments with " ", then drop the first (leading) space.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        nearby = result.xpath("./div/div/p[2]/span/text()")[0]
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }


def main():
    page = 1
    html = get_one_page(page)
    print(type(html))
    # Removed a stray `parse_one_page(html)` call: it built a generator and
    # discarded it without consuming it, so it had no effect.
    for item in parse_one_page(html):
        print(item)


if __name__ == '__main__':
    main()
    time.sleep(1)
獲取多個頁面
def parse_one_page(sourcehtml):
    """Parse one page's HTML source and yield one dict per listing."""
    contentTree = etree.HTML(sourcehtml)          # parse the source code
    results = contentTree.xpath('//ul[@id="houseList"]/li')  # extract listing rows via XPath
    for result in results[1:]:
        title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join the area fragments with " ", then drop the first (leading) space.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        # Guard the index: some rows have no "nearby" span, which raised IndexError.
        nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()")) > 0 else ""
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }
        # Removed a leftover debug `print(nearby)` that ran after each yield.


def get_pages():
    """Return the total page count, parsed from the pager widget's text."""
    page = 1
    html = get_one_page(page)
    contentTree = etree.HTML(html)
    pages = int(contentTree.xpath('//div[@class="pages"]/span[2]/text()')[0].strip("共頁"))
    return pages


def main():
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        for item in parse_one_page(html):
            print(item)


if __name__ == '__main__':
    main()
    time.sleep(1)
存儲到MongoDB中
需確保MongoDB已啟動服務, 否則必然會存儲失敗
def save_to_mongodb(result):
    """Store one listing dict in MongoDB (database ``iroomz``, collection ``roominfo``).

    Prints a success/failure message; MongoDB must already be running locally.
    """
    # Create the client connection to the local server.
    client = pymongo.MongoClient(host="localhost")
    # Select the database.
    db = client.iroomz
    # Select the collection.
    db_table = db.roominfo
    try:
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        if db_table.insert_one(result):
            print("---存儲到數據庫成功---",result)
    except Exception:
        print("---存儲到數據庫失敗---",result)
完整代碼
# -*- coding: utf-8 -*-

import requests
import time
import pymongo
from lxml import etree
from requests.exceptions import RequestException


def get_one_page(page):
    """Fetch the HTML source of listing page *page*; return None on failure."""
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0(WindowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def parse_one_page(sourcehtml):
    """Parse one page's source and save one record per listing to MongoDB."""
    contentTree = etree.HTML(sourcehtml)          # parse the source code
    results = contentTree.xpath('//ul[@id="houseList"]/li')  # extract listing rows via XPath
    for result in results[1:]:
        title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        # Join the area fragments with " ", then drop the first (leading) space.
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)
        # Guard the index: some rows have no "nearby" span, which raised IndexError.
        nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()")) > 0 else ""
        data = {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }
        save_to_mongodb(data)


def get_pages():
    """Return the total page count, parsed from the pager widget's text."""
    page = 1
    html = get_one_page(page)
    contentTree = etree.HTML(html)
    pages = int(contentTree.xpath('//div[@class="pages"]/span[2]/text()')[0].strip("共頁"))
    return pages


def save_to_mongodb(result):
    """Store one listing dict in MongoDB (database ``iroomz``, collection ``roominfo``)."""
    # Create the client connection to the local server.
    client = pymongo.MongoClient(host="localhost")
    db = client.iroomz
    db_table = db.roominfo
    try:
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        if db_table.insert_one(result):
            print("---存儲到數據庫成功---",result)
    except Exception:
        print("---存儲到數據庫失敗---",result)


def main():
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        parse_one_page(html)


if __name__ == '__main__':
    main()
    time.sleep(1)
最終結果
總結
在第三步中XPath使用注意事項
title = result.xpath("./div/h3/a/text()")
此處的點'.'不能忘記, 它表示當前節(jié)點, 如果不加'.', '/'就表示從根節(jié)點開始選取
在第四步獲取多個頁面時出現索引超出范圍錯誤
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip()
IndexError: list index out of range
造成這種錯誤原因有兩種:
- [index] index超出list范圍
- [index] index索引內容為空
因為這里的nearby的index是0, 排除第一種情況, 那么這里就是空行了, 加句if判斷就可以解決
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip()
#改寫以后:nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()"))>0 else ""
以上主要是對爬蟲過程學習的總結, 若有不對的地方, 還請指正, 謝謝!