Week 2 Hands-on Assignment: Scraping 10,000 Product Listings

Off to Ganji~~~

Aim

Result

ScreenShot

Screenshot: 8k hrefs

Screenshot: 3k info records

PrintOut

Because I didn't add time.sleep, the crawler gets hit by anti-scraping measures; once it hangs I usually press Ctrl+C to abort that category's crawl... (a paced-request sketch is at the end of this section)

When a second-level page is scraped successfully its name is printed; otherwise the page where it stopped is printed.

end at /jiaju/o39/
/rirongbaihuo/ end at 168

Fail to Get Info fromhttp://cs.ganji.com/shouji/2245928138x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2245928138x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2201642961x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2201642961x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2178792516x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2178792516x.htm

Also, no exception was raised???? Opening the pages above, it turns out those items had already been sold.
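Coming back to the blocking problem: since it comes from firing requests with no pause at all, here is a minimal sketch of a paced fetch. polite_get and the 1-3 second range are my own invention, not something tuned against Ganji:

import random
import time

import requests

def polite_get(url, min_delay=1.0, max_delay=3.0):
    # Sleep a random 1-3 seconds before each request so the crawler
    # does not hammer the site; adjust the range as needed.
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, timeout=2)

Swapping this in for the bare requests.get calls inside GetSoup would be one way to avoid the Ctrl+C routine.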

Code

Wrapping Soup

def GetSoup(url,buquan=1):
    if buquan:
        wb_data = requests.get((BaseUrl + url),timeout=2) # some pages need the site prefix added
    else:
        wb_data = requests.get(url)
    wb_data.encoding = 'utf8' # without this line the text comes back garbled!!!
    if wb_data.status_code == 200:
        soup = BeautifulSoup(wb_data.text,'lxml')
        return soup
    else:
        print('Fail to Get Info from'+url)
        return None
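For reference, how the two modes are meant to be used (this assumes GetSoup and BaseUrl from the snippet are in scope; the product URL is one of the failing links from the printout above):

listing_soup = GetSoup('/wu/')  # relative channel path, BaseUrl is prepended (buquan=1)
detail_soup = GetSoup('http://cs.ganji.com/shouji/2245928138x.htm', buquan=0)  # full URL used as-is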

Getting the channels

TypeSel = '#wrapper > div.content > div > div > dl > dt > a'

def GetChannal(url):
    soup = GetSoup(url)
    if soup == None:
        return None
    else:
        types = soup.select(TypeSel)
        for type in types:
            href = type.get('href')
            title = type.get_text()
            FirstUrl.insert_one(dict(title=title,href=href))
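A quick sanity check after running it, assuming the MongoDB collections and ErShouUrl defined in the full code below; the count line is only for eyeballing the result:

GetChannal(ErShouUrl)  # ErShouUrl = '/wu/' in the full code
print(FirstUrl.count_documents({}), 'channel links saved to FirstUrl')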

Scraping each channel

  1. Use a set to weed out duplicates (because on Ganji, pages past the end redirect to the same page..); a MongoDB-based alternative is sketched after the code below
  2. return a value indicating whether the page was scraped successfully
GoodSel = '#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'

ThirdSet = set()

def GetGoods(url):
    soup = GetSoup(url)
    if soup != None:
        goods = soup.select(GoodSel)
        for good,p in zip(goods,range(1,10)): # zip with range(1,10) caps this at the first 9 listings per page
            title = good.get_text().strip()
            href = good.get('href')
            data = dict(title=title,href=href)
            if data['href'] in ThirdSet:
                return False
            else:
                ThirdSet.add(data['href'])
                SecondUrl.insert_one(data)
        return True
    else:
        return False

def GetGoodsUrl():
    for up in FirstUrl.find():
        base = up['href']
        for p in range(1, 10000):
            st = base + 'o' + str(p) + '/'
            try:
                if GetGoods(st) == False:
                    print(base, 'end at', str(p)) # this channel is finished
                    break
            except:
                print('error in page', st) # something is buggy on this page
                pass
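The in-memory ThirdSet does double duty here: it dedupes hrefs and, because out-of-range pages redirect to an already-seen page, it also acts as the stop signal for pagination. As an alternative for the dedup half only (not what the code above does), the check could be pushed into MongoDB with a unique index on href, so a restarted crawl skips links it already stored:

from pymongo.errors import DuplicateKeyError

# One-time setup: make href unique in the SecondUrl collection.
SecondUrl.create_index('href', unique=True)

def save_good(data):
    # Insert a listing; report whether it was new to the collection.
    try:
        SecondUrl.insert_one(data)
        return True
    except DuplicateKeyError:
        return False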

Scraping each product page

I really should have run this with multiprocessing.... (a sketch follows the full code below)

Still, since the work is split across a few functions, a crash never hurts too badly..

The place field and the new/used category are pretty annoying QAQ

Missing info takes down a whole batch (many jiaju pages have no new/used field, for example...)

So I deleted the try and debugged on a small scale first.. (a defensive helper for missing fields is sketched right after this snippet)

ttSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > h1'
tmSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > div > ul.title-info-l.clearfix > li > i'
tpSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > span > a'
pcSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > i.f22.fc-orange.f-type'
plSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > a'
newSel = '#wrapper > div.content.clearfix > div.leftBox > div > div.det-summary > div > div.second-dt-bewrite > ul > li'

def GetGoodfInfo(url):
    soup = GetSoup(url,buquan=0)
    if soup != None:
        titles = soup.select(ttSel)[0].get_text()
        timers = soup.select(tmSel)[0].get_text().split('\\')[0].strip().split('\xa0')[0]
        types = soup.select(tpSel)[5].get_text()
        prices = soup.select(pcSel)[0].get_text()
        places = soup.select(plSel)
        place = ''.join(places[i].get_text() for i in range(1,4))
        news = soup.select(newSel)[0].get_text().split(':')[1].replace('\n','').strip()
        #print('place',place)
        #print('type',types)
        data = dict(title=titles,time=timers,type=types,price=prices,place=place,new=news)
        #print(data)
        Info.insert_one(data)
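The crashes described above all come from indexing into an empty select() result. A hedged way to make that defensive is a small fallback helper; select_text and its index and default arguments are my naming, not part of the original code:

def select_text(soup, selector, index=0, default=None):
    # Return the stripped text of the index-th match of selector,
    # or default when the page lacks that element (e.g. jiaju pages with no new/used field).
    found = soup.select(selector)
    if len(found) > index:
        return found[index].get_text().strip()
    return default

With it, a line like news = select_text(soup, newSel) would not take the whole category down; the full code below takes the simpler route of inline if checks instead.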

Full Code

#-*- coding: utf8 -*-
from bs4 import BeautifulSoup
from pymongo import MongoClient
import requests


client = MongoClient('localhost',27017)
dataBase = client['dataBase']
FirstUrl = dataBase['FirstUrl']
SecondUrl = dataBase['SecondUrl']
Info = dataBase['Info']


BaseUrl = 'http://cs.ganji.com'

ErShouUrl = '/wu/'

headers = {
    'Cookie':'ganji_uuid=3031153094415879005517; ganji_xuuid=3d4ba517-3b7d-4f20-e0d7-05de7a77b9c9.1459734446990; t3=2; statistics_clientid=me; __utmt=1; GANJISESSID=50071f0ac4021a7aa6fcfc7c52f229fa; STA_DS=1; lg=1; ganji_login_act=1470799155185; citydomain=cs; _gl_tracker=%7B%22ca_source%22%3A%22www.baidu.com%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A40680747040%7D; __utma=32156897.1667757214.1468391541.1468391541.1470799081.2; __utmb=32156897.7.10.1470799081; __utmc=32156897; __utmz=32156897.1470799081.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}

def GetSoup(url,buquan=1):
    if buquan:
        wb_data = requests.get((BaseUrl + url),timeout=2)
    else:
        wb_data = requests.get(url)
    wb_data.encoding = 'utf8' # without this line the text comes back garbled!!!
    if wb_data.status_code == 200:
        soup = BeautifulSoup(wb_data.text,'lxml')
        return soup
    else:
        print('Fail to Get Info from'+url)
        return None

TypeSel = '#wrapper > div.content > div > div > dl > dt > a'

def GetChannal(url):
    soup = GetSoup(url)
    if soup == None:
        return None
    else:
        types = soup.select(TypeSel)
        for type in types:
            href = type.get('href')
            title = type.get_text()
            FirstUrl.insert_one(dict(title=title,href=href))

GoodSel = '#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'

ThirdSet = set()

def GetGoods(url):
    soup = GetSoup(url)
    if soup != None:
        goods = soup.select(GoodSel)
        for good,p in zip(goods,range(1,10)):
            title = good.get_text().strip()
            href = good.get('href')
            data = dict(title=title,href=href)
            if data['href'] in ThirdSet:
                return False
            else:
                ThirdSet.add(data['href'])
                SecondUrl.insert_one(data)
        return True
    else:
        return False

ttSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > h1'
tmSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > div > ul.title-info-l.clearfix > li > i'
tpSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > span > a'
pcSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > i.f22.fc-orange.f-type'
plSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > a'
newSel = '#wrapper > div.content.clearfix > div.leftBox > div > div.det-summary > div > div.second-dt-bewrite > ul > li'

def GetGoodfInfo(url):
    soup = GetSoup(url,buquan=0)
    if soup != None:
        titles = soup.select(ttSel)[0].get_text()
        timers = soup.select(tmSel)[0].get_text().split('\\')[0].strip().split('\xa0')[0] if soup.select(tmSel) != [] else None
        types = soup.select(tpSel)[5].get_text()
        prices = soup.select(pcSel)[0].get_text()
        places = soup.select(plSel)
        if len(places) > 4:
            place = ''.join(places[i].get_text() for i in range(1,4))
        else:
            place = None
        news = soup.select(newSel)[0].get_text().replace('\n','').replace(' ','').strip() if soup.select(newSel) != [] else None
        #print('place',place)
        #print('type',types)
        data = dict(title=titles,time=timers,type=types,price=prices,place=place,new=news)
        #print(data)
        Info.insert_one(data)

def GetGoodsUrl():
    for up in FirstUrl.find():
        base = up['href']
        for p in range(1, 10000):
            st = base + 'o' + str(p) + '/'
            try:
                if GetGoods(st) == False:
                    print(base, 'end at', str(p))
                    break
            except:
                print('error in page', st)
                pass

if __name__ == '__main__':
    GetChannal(ErShouUrl)
    GetGoodsUrl()
    for url in SecondUrl.find():
        try:
            GetGoodfInfo(url['href'])
        except Exception as e:
            print(str(e), 'fail to get ', url['href'])
            pass
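On the "should have used multiprocessing" note: a minimal sketch of how the detail-page step could be parallelized. The pool size of 4, pulling the hrefs into a list first, and the helper name crawl_all_details are all my choices; note that pymongo recommends creating the MongoClient after forking, which this sketch glosses over:

from multiprocessing import Pool

def crawl_all_details():
    # Collect the hrefs in the parent process, then let 4 workers
    # run GetGoodfInfo over them concurrently.
    hrefs = [doc['href'] for doc in SecondUrl.find()]
    with Pool(4) as pool:
        pool.map(GetGoodfInfo, hrefs)

# crawl_all_details()  # would replace the sequential loop in __main__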

Takeaways and Questions

Notes

  1. After changing '>' to ' ' in a selector, the match is no longer necessarily a direct child element (see the example after the snippet below)
  2. High-traffic sites limit how often a single IP can hit them
  3. lambda x:x.text
  4. Parsing with lxml (import lxml) is much faster
  5. try is fine for a small run; on a large run there are too many bugs and you still have to actually debug, e.g.:
try:
    GetGoodfInfo(url['href'])
except Exception as e:
    print(str(e), 'fail to get ', url['href'])
    pass
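To illustrate note 1, a tiny example of '>' (direct child only) versus ' ' (any descendant) in BeautifulSoup's select; the HTML snippet is made up:

from bs4 import BeautifulSoup

html = '<div class="box"><ul><li><a href="#">deep link</a></li></ul></div>'
soup = BeautifulSoup(html, 'lxml')

print(soup.select('div.box > a'))  # [] -- the <a> is not a direct child of div.box
print(soup.select('div.box a'))    # [<a href="#">deep link</a>] -- any descendant matches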

Takeaways

I forgot to check for 404... silly me........ luckily I had wrapped the calls in a try before running.

Once the amount of data suddenly grew this much, I really felt the appeal of crawlers.

I didn't use time.sleep while crawling, and thanks to the natural latency it still didn't fail too badly.

With the try in place, when a category hangs, Ctrl+C lets the crawl continue with the next page.

For a large crawl, scrape a few hundred items first to check that the string handling generalizes~
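That "test a few hundred first" advice as a snippet: run the detail parser on a small slice of SecondUrl with no try around it, so selector and string bugs surface immediately. The limit of 200 is arbitrary:

# Parse a small sample of detail pages and let exceptions propagate,
# so string-handling bugs show up before the full 10k run.
for doc in SecondUrl.find().limit(200):
    GetGoodfInfo(doc['href'])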

Questions

But I still have a few questions:

  1. Some of the scraped pages are duplicated... is it because the site refreshed...
  2. After scraping 8k URLs the crawl just stopped making progress...
  3. I don't really see what makes crawling a large number of pages different from crawling a few?

Replies

The duplicates may come from a problem in the crawl rules, or from the site itself, so they need to be cleaned up.

As for the stall at 8k URLs, it depends on what error it reports; if there is no error message it is hard to track down.

There are many differences between large and small crawls: a large crawl forces you to take care of far more details; case in point, it stopped working at 8k URLs.

chmod can be applied to a whole folder: add -R and you don't have to change the files one by one.

Also, I found the duplicates seem to be because.. I had two .py programs running at the same time, both writing into the database..........
