Week 2 Hands-on Assignment: Scraping 10,000 Product Listings

Off to Ganji~~~

Aim

Result

ScreenShot

Screenshot: 8k hrefs

Screenshot: 3k info records

PrintOut

Because I didn't add time.sleep, the crawler gets hit by anti-scraping measures; once it hangs I usually press Ctrl+C to abort that category's crawl... (a paced-request sketch is at the end of this section)

When a second-level page is scraped successfully its name is printed; otherwise the page where it stopped is printed.

end at /jiaju/o39/
/rirongbaihuo/ end at 168

Fail to Get Info fromhttp://cs.ganji.com/shouji/2245928138x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2245928138x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2201642961x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2201642961x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2178792516x.htm
Fail to Get Info fromhttp://cs.ganji.com/shouji/2178792516x.htm

Also, no exception was raised???? Opening the pages above, it turns out those items had already been sold.
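Coming back to the blocking problem: since it comes from firing requests with no pause at all, here is a minimal sketch of a paced fetch. polite_get and the 1-3 second range are my own invention, not something tuned against Ganji:

import random
import time

import requests

def polite_get(url, min_delay=1.0, max_delay=3.0):
    # Sleep a random 1-3 seconds before each request so the crawler
    # does not hammer the site; adjust the range as needed.
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, timeout=2)

Swapping this in for the bare requests.get calls inside GetSoup would be one way to avoid the Ctrl+C routine.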

Code

Wrapping Soup

def GetSoup(url,buquan=1):
    if buquan:
        wb_data = requests.get((BaseUrl + url),timeout=2) # some pages need the site prefix added
    else:
        wb_data = requests.get(url)
    wb_data.encoding = 'utf8' # without this line the text comes back garbled!!!
    if wb_data.status_code == 200:
        soup = BeautifulSoup(wb_data.text,'lxml')
        return soup
    else:
        print('Fail to Get Info from'+url)
        return None
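For reference, how the two modes are meant to be used (this assumes GetSoup and BaseUrl from the snippet are in scope; the product URL is one of the failing links from the printout above):

listing_soup = GetSoup('/wu/')  # relative channel path, BaseUrl is prepended (buquan=1)
detail_soup = GetSoup('http://cs.ganji.com/shouji/2245928138x.htm', buquan=0)  # full URL used as-is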

Getting the channels

TypeSel = '#wrapper > div.content > div > div > dl > dt > a'

def GetChannal(url):
    soup = GetSoup(url)
    if soup == None:
        return None
    else:
        types = soup.select(TypeSel)
        for type in types:
            href = type.get('href')
            title = type.get_text()
            FirstUrl.insert_one(dict(title=title,href=href))
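A quick sanity check after running it, assuming the MongoDB collections and ErShouUrl defined in the full code below; the count line is only for eyeballing the result:

GetChannal(ErShouUrl)  # ErShouUrl = '/wu/' in the full code
print(FirstUrl.count_documents({}), 'channel links saved to FirstUrl')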

Scraping each channel

  1. Use a set to weed out duplicates (because on Ganji, pages past the end redirect to the same page..); a MongoDB-based alternative is sketched after the code below
  2. return a value indicating whether the page was scraped successfully
GoodSel = '#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'

ThirdSet = set()

def GetGoods(url):
    soup = GetSoup(url)
    if soup != None:
        goods = soup.select(GoodSel)
        for good,p in zip(goods,range(1,10)): # zip with range(1,10) caps this at the first 9 listings per page
            title = good.get_text().strip()
            href = good.get('href')
            data = dict(title=title,href=href)
            if data['href'] in ThirdSet:
                return False
            else:
                ThirdSet.add(data['href'])
                SecondUrl.insert_one(data)
        return True
    else:
        return False

def GetGoodsUrl():
    for up in FirstUrl.find():
        base = up['href']
        for p in range(1, 10000):
            st = base + 'o' + str(p) + '/'
            try:
                if GetGoods(st) == False:
                    print(base, 'end at', str(p)) # this channel is finished
                    break
            except:
                print('error in page', st) # something is buggy on this page
                pass
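The in-memory ThirdSet does double duty here: it dedupes hrefs and, because out-of-range pages redirect to an already-seen page, it also acts as the stop signal for pagination. As an alternative for the dedup half only (not what the code above does), the check could be pushed into MongoDB with a unique index on href, so a restarted crawl skips links it already stored:

from pymongo.errors import DuplicateKeyError

# One-time setup: make href unique in the SecondUrl collection.
SecondUrl.create_index('href', unique=True)

def save_good(data):
    # Insert a listing; report whether it was new to the collection.
    try:
        SecondUrl.insert_one(data)
        return True
    except DuplicateKeyError:
        return False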

Scraping each product page

I really should have run this with multiprocessing.... (a sketch follows the full code below)

Still, since the work is split across a few functions, a crash never hurts too badly..

The place field and the new/used category are pretty annoying QAQ

Missing info takes down a whole batch (many jiaju pages have no new/used field, for example...)

So I deleted the try and debugged on a small scale first.. (a defensive helper for missing fields is sketched right after this snippet)

ttSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > h1'
tmSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > div > ul.title-info-l.clearfix > li > i'
tpSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > span > a'
pcSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > i.f22.fc-orange.f-type'
plSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > a'
newSel = '#wrapper > div.content.clearfix > div.leftBox > div > div.det-summary > div > div.second-dt-bewrite > ul > li'

def GetGoodfInfo(url):
    soup = GetSoup(url,buquan=0)
    if soup != None:
        titles = soup.select(ttSel)[0].get_text()
        timers = soup.select(tmSel)[0].get_text().split('\\')[0].strip().split('\xa0')[0]
        types = soup.select(tpSel)[5].get_text()
        prices = soup.select(pcSel)[0].get_text()
        places = soup.select(plSel)
        place = ''.join(places[i].get_text() for i in range(1,4))
        news = soup.select(newSel)[0].get_text().split(':')[1].replace('\n','').strip()
        #print('place',place)
        #print('type',types)
        data = dict(title=titles,time=timers,type=types,price=prices,place=place,new=news)
        #print(data)
        Info.insert_one(data)
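The crashes described above all come from indexing into an empty select() result. A hedged way to make that defensive is a small fallback helper; select_text and its index and default arguments are my naming, not part of the original code:

def select_text(soup, selector, index=0, default=None):
    # Return the stripped text of the index-th match of selector,
    # or default when the page lacks that element (e.g. jiaju pages with no new/used field).
    found = soup.select(selector)
    if len(found) > index:
        return found[index].get_text().strip()
    return default

With it, a line like news = select_text(soup, newSel) would not take the whole category down; the full code below takes the simpler route of inline if checks instead.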

Full Code

#-*- coding: utf8 -*-
from bs4 import BeautifulSoup
from pymongo import MongoClient
import requests


client = MongoClient('localhost',27017)
dataBase = client['dataBase']
FirstUrl = dataBase['FirstUrl']
SecondUrl = dataBase['SecondUrl']
Info = dataBase['Info']


BaseUrl = 'http://cs.ganji.com'

ErShouUrl = '/wu/'

headers = {
    'Cookie':'ganji_uuid=3031153094415879005517; ganji_xuuid=3d4ba517-3b7d-4f20-e0d7-05de7a77b9c9.1459734446990; t3=2; statistics_clientid=me; __utmt=1; GANJISESSID=50071f0ac4021a7aa6fcfc7c52f229fa; STA_DS=1; lg=1; ganji_login_act=1470799155185; citydomain=cs; _gl_tracker=%7B%22ca_source%22%3A%22www.baidu.com%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A40680747040%7D; __utma=32156897.1667757214.1468391541.1468391541.1470799081.2; __utmb=32156897.7.10.1470799081; __utmc=32156897; __utmz=32156897.1470799081.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}

def GetSoup(url,buquan=1):
    if buquan:
        wb_data = requests.get((BaseUrl + url),timeout=2)
    else:
        wb_data = requests.get(url)
    wb_data.encoding = 'utf8' # without this line the text comes back garbled!!!
    if wb_data.status_code == 200:
        soup = BeautifulSoup(wb_data.text,'lxml')
        return soup
    else:
        print('Fail to Get Info from'+url)
        return None

TypeSel = '#wrapper > div.content > div > div > dl > dt > a'

def GetChannal(url):
    soup = GetSoup(url)
    if soup == None:
        return None
    else:
        types = soup.select(TypeSel)
        for type in types:
            href = type.get('href')
            title = type.get_text()
            FirstUrl.insert_one(dict(title=title,href=href))

GoodSel = '#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'

ThirdSet = set()

def GetGoods(url):
    soup = GetSoup(url)
    if soup != None:
        goods = soup.select(GoodSel)
        for good,p in zip(goods,range(1,10)):
            title = good.get_text().strip()
            href = good.get('href')
            data = dict(title=title,href=href)
            if data['href'] in ThirdSet:
                return False
            else:
                ThirdSet.add(data['href'])
                SecondUrl.insert_one(data)
        return True
    else:
        return False

ttSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > h1'
tmSel = '#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > div > ul.title-info-l.clearfix > li > i'
tpSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > span > a'
pcSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > i.f22.fc-orange.f-type'
plSel = '#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li > a'
newSel = '#wrapper > div.content.clearfix > div.leftBox > div > div.det-summary > div > div.second-dt-bewrite > ul > li'

def GetGoodfInfo(url):
    soup = GetSoup(url,buquan=0)
    if soup != None:
        titles = soup.select(ttSel)[0].get_text()
        timers = soup.select(tmSel)[0].get_text().split('\\')[0].strip().split('\xa0')[0] if soup.select(tmSel) != [] else None
        types = soup.select(tpSel)[5].get_text()
        prices = soup.select(pcSel)[0].get_text()
        places = soup.select(plSel)
        if len(places) > 4:
            place = ''.join(places[i].get_text() for i in range(1,4))
        else:
            place = None
        news = soup.select(newSel)[0].get_text().replace('\n','').replace(' ','').strip() if soup.select(newSel) != [] else None
        #print('place',place)
        #print('type',types)
        data = dict(title=titles,time=timers,type=types,price=prices,place=place,new=news)
        #print(data)
        Info.insert_one(data)

def GetGoodsUrl():
    for up in FirstUrl.find():
        base = up['href']
        for p in range(1, 10000):
            st = base + 'o' + str(p) + '/'
            try:
                if GetGoods(st) == False:
                    print(base, 'end at', str(p))
                    break
            except:
                print('error in page', st)
                pass

if __name__ == '__main__':
    GetChannal(ErShouUrl)
    GetGoodsUrl()
    for url in SecondUrl.find():
        try:
            GetGoodfInfo(url['href'])
        except Exception as e:
            print(str(e), 'fail to get ', url['href'])
            pass
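On the "should have used multiprocessing" note: a minimal sketch of how the detail-page step could be parallelized. The pool size of 4, pulling the hrefs into a list first, and the helper name crawl_all_details are all my choices; note that pymongo recommends creating the MongoClient after forking, which this sketch glosses over:

from multiprocessing import Pool

def crawl_all_details():
    # Collect the hrefs in the parent process, then let 4 workers
    # run GetGoodfInfo over them concurrently.
    hrefs = [doc['href'] for doc in SecondUrl.find()]
    with Pool(4) as pool:
        pool.map(GetGoodfInfo, hrefs)

# crawl_all_details()  # would replace the sequential loop in __main__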

Takeaways and Questions

Notes

  1. After changing '>' to ' ' in a selector, the match is no longer necessarily a direct child element (see the example after the snippet below)
  2. High-traffic sites limit how often a single IP can hit them
  3. lambda x:x.text
  4. Parsing with lxml (import lxml) is much faster
  5. try is fine for a small run; on a large run there are too many bugs and you still have to actually debug, e.g.:
try:
    GetGoodfInfo(url['href'])
except Exception as e:
    print(str(e), 'fail to get ', url['href'])
    pass
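To illustrate note 1, a tiny example of '>' (direct child only) versus ' ' (any descendant) in BeautifulSoup's select; the HTML snippet is made up:

from bs4 import BeautifulSoup

html = '<div class="box"><ul><li><a href="#">deep link</a></li></ul></div>'
soup = BeautifulSoup(html, 'lxml')

print(soup.select('div.box > a'))  # [] -- the <a> is not a direct child of div.box
print(soup.select('div.box a'))    # [<a href="#">deep link</a>] -- any descendant matches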

Takeaways

I forgot to check for 404... silly me........ luckily I had wrapped the calls in a try before running.

Once the amount of data suddenly grew this much, I really felt the appeal of crawlers.

I didn't use time.sleep while crawling, and thanks to the natural latency it still didn't fail too badly.

With the try in place, when a category hangs, Ctrl+C lets the crawl continue with the next page.

For a large crawl, scrape a few hundred items first to check that the string handling generalizes~
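That "test a few hundred first" advice as a snippet: run the detail parser on a small slice of SecondUrl with no try around it, so selector and string bugs surface immediately. The limit of 200 is arbitrary:

# Parse a small sample of detail pages and let exceptions propagate,
# so string-handling bugs show up before the full 10k run.
for doc in SecondUrl.find().limit(200):
    GetGoodfInfo(doc['href'])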

Questions

But I still have a few questions:

  1. Some of the scraped pages are duplicated... is it because the site refreshed...
  2. After scraping 8k URLs the crawl just stopped making progress...
  3. I don't really see what makes crawling a large number of pages different from crawling a few?

Replies

The duplicates may come from a problem in the crawl rules, or from the site itself, so they need to be cleaned up.

As for the stall at 8k URLs, it depends on what error it reports; if there is no error message it is hard to track down.

There are many differences between large and small crawls: a large crawl forces you to take care of far more details; case in point, it stopped working at 8k URLs.

chmod can be applied to a whole folder: add -R and you don't have to change the files one by one.

Also, I found the duplicates seem to be because.. I had two .py programs running at the same time, both writing into the database..........
