Python实战计划: Scraping Large-Scale Data

300 phone numbers from Guangzhou

Notes

  • Observe the page structure
  • Handle the differing numbers of listings across areas
  • Design the workflow (two spiders: one collects listing URLs, the other scrapes each detail page)
  • Add a page check so that error (404) pages are not scraped; see the sketch after this list
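
The error-page check the last bullet refers to is the one used in Get_Info further down: on 58.com, a removed listing renders a page whose last <script> block ends with a 404 marker. Below is a minimal stand-alone version of that check, distilled from the full code in this post; listing_is_gone is just an illustrative name, and the [-6:-3] slice is copied from that code and is specific to 58.com's template.

from bs4 import BeautifulSoup
import requests

def listing_is_gone(url, headers=None):
    # True if the page looks like 58.com's "listing removed" (404) template
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    last_script = soup.find_all('script')[-1].get_text()
    return last_script[-6:-3] == '404'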

Aim

Result

Output

Code

Picking out the right page elements (selectors) took quite a while…

There is also the need to filter out 404 pages.

# -*- coding: utf-8 -*-
from pymongo import MongoClient
from bs4 import BeautifulSoup
import requests
import time

host = 'localhost'
port = 27017

client = MongoClient(host, port)
db = client['db']
url_sheet = db['url_sheet']
info_sheet = db['info_sheet']

# Listing pages 1 to 10
Base_Url = ['http://gz.58.com/shoujihao/pn{}/'.format(str(i)) for i in range(1, 11)]
headers = {
    'Cookie': '...'
}

# Select the <a> of each listing so that both its text and its href are available
hrefSel = '#infolist > div > ul > div > ul > li > a'

def checkNumber(num):
    # True only if every character of num is a digit
    for i in range(0, len(num)):
        if num[i] < '0' or num[i] > '9':
            return False
    return True

tempOut = open('out.txt', 'w')

def Get_Url(url):
    time.sleep(0.5)
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    hrefs = soup.select(hrefSel)

    for p in hrefs:
        if p.get_text().strip().find('精准') == -1:  # skip the "精准" promoted listings
            # The extracted text contains a lot of whitespace; strip it out with replace, e.g.
            # "title": title_text.replace("\n", "").replace("\t", "").replace(" ", "")
            number = p.get_text().strip().replace('\n', ' ').replace('\t', ' ').replace(' ', ' ').split(' ')[0]
            if len(number) == 11:
                url_sheet.insert_one(dict(number=number, url=p.get('href')))

TestInfoUrl = 'http://gz.58.com/shoujihao/26973010446666x.shtml?psid=192625604192771088616429809&entinfo=26973010446666_0'

PhoneSel = '#main > div.col.detailPrimary.mb15 > div.col_sub.mainTitle > h1'
ConnectSel = '#t_phone'
PutinTimeSel = '#main > div.col.detailPrimary.mb15 > div.col_sub.mainTitle > div > ul.mtit_con_left.fl > li.time'

def test404():
    # Offline test: on a removed listing, the last <script> block ends with a 404 marker
    f = open('hehe.html', 'r')
    soup = BeautifulSoup(f, 'lxml')
    script_text = soup.find_all('script')[-1].get_text()
    if script_text[-6:-3] == '404':
        print('404')

def Get_Info(url=TestInfoUrl):
    time.sleep(1)
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    script_text = soup.find_all('script')[-1].get_text()
    if script_text[-6:-3] == '404':
        # The listing no longer exists, so skip it
        print('404')
    else:
        Phone = soup.select(PhoneSel)[0].get_text().replace('\n', ' ').strip()[:11]
        cto = soup.select(ConnectSel)[0].get_text().replace('\n', ' ').strip()
        timer = soup.select(PutinTimeSel)[0].get_text().replace('\n', ' ').strip()
        # print(Phone, cto, timer, sep='#')
        info_sheet.insert_one(dict(number=Phone, ConnectNumber=cto, PostTime=timer, Url=url))

def main():
    for pU in Base_Url:
        Get_Url(pU)
    for url in url_sheet.find():
        Get_Info(url['url'])
    # Get_Info()
    # test404()

if __name__ == '__main__':
    main()
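
To check that the run actually produced the roughly 300 Guangzhou numbers mentioned at the top, the two collections can be inspected directly. A quick sketch, assuming pymongo 3.7+ for count_documents and the collection and field names defined in the code above:

from pymongo import MongoClient

db = MongoClient('localhost', 27017)['db']
print(db['url_sheet'].count_documents({}))   # listing URLs collected by Get_Url
print(db['info_sheet'].count_documents({}))  # detail records saved by Get_Info

# Peek at a few of the saved records
for doc in db['info_sheet'].find().limit(3):
    print(doc['number'], doc['PostTime'], doc['Url'])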

Notes on the course code: the job is split into two spiders that crawl separately.

# The name on the far left is the Python object; the name after it is the collection in the database
# spider 1
def get_links_from(channel, pages, who_sells=0):
    # If 'td.t' is missing from the page, we have run past the last page: stop
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    ...
    if soup.find('td', 't'):
        ...
        # return urls
    else:
        # It's the last page!
        pass

# spider 2
def get_item_info(url):
    # 404 finder
    no_longer_exist = '404' in soup.find('script', type="text/javascript").get('src').split('/')
    if no_longer_exist:
        pass
    else:
        # info getter
        title = soup.title.text
        price = soup.select('span.price.c_f50')[0].text
        date = soup.select('.time')[0].text
        area = list(soup.select('.c_25d a')[0].stripped_strings) if soup.find_all('span', 'c_25d') else None
        item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})
        print({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})
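
For an actual large-scale run, these two spiders still need a driver. Here is a hedged sketch of how they might be wired together; channel_list, the page range, and the Pool-based driver are my assumptions about the surrounding course code rather than part of the notes above, and url_list stands for the MongoDB collection that get_links_from fills.

from multiprocessing import Pool

def get_all_links_from(channel):
    # Walk one channel's listing pages; get_links_from itself stops adding URLs past the last page
    for page in range(1, 101):   # assumed upper bound on pages per channel
        get_links_from(channel, page)

if __name__ == '__main__':
    pool = Pool()
    # spider 1: fill the URL collection, one worker per channel
    pool.map(get_all_links_from, channel_list)
    # spider 2: scrape every collected URL into item_info
    pool.map(get_item_info, [doc['url'] for doc in url_list.find()])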

Summary

This lesson extends the earlier ones with error-page handling.

Crawling the listing URLs has also become harder, with many more conditional checks involved.

Running the two spiders together really shows off what the database is good for.

Treat it as review and practice, haha. Level up~~~

It is Qixi (Chinese Valentine's Day) today; best wishes to all the couples out there.
