Python实战计划: Scraping Large-Scale Data

300 phone numbers from Guangzhou

Notes

  • Observe the page structure
  • Handle the differing numbers of listings across areas
  • Design the workflow (two spiders: one collects listing URLs, the other scrapes each detail page)
  • Add a page check so that error (404) pages are not scraped; see the sketch after this list
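
The error-page check the last bullet refers to is the one used in Get_Info further down: on 58.com, a removed listing renders a page whose last <script> block ends with a 404 marker. Below is a minimal stand-alone version of that check, distilled from the full code in this post; listing_is_gone is just an illustrative name, and the [-6:-3] slice is copied from that code and is specific to 58.com's template.

from bs4 import BeautifulSoup
import requests

def listing_is_gone(url, headers=None):
    # True if the page looks like 58.com's "listing removed" (404) template
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    last_script = soup.find_all('script')[-1].get_text()
    return last_script[-6:-3] == '404'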

Aim

Result

Output

Code

Picking out the right page elements (selectors) took quite a while…

There is also the need to filter out 404 pages.

# -*- coding: utf-8 -*-
from pymongo import MongoClient
from bs4 import BeautifulSoup
import requests
import time

host = 'localhost'
port = 27017

client = MongoClient(host, port)
db = client['db']
url_sheet = db['url_sheet']
info_sheet = db['info_sheet']

# Listing pages 1 to 10
Base_Url = ['http://gz.58.com/shoujihao/pn{}/'.format(str(i)) for i in range(1, 11)]
headers = {
    'Cookie': '...'
}

# Select the <a> of each listing so that both its text and its href are available
hrefSel = '#infolist > div > ul > div > ul > li > a'

def checkNumber(num):
    # True only if every character of num is a digit
    for i in range(0, len(num)):
        if num[i] < '0' or num[i] > '9':
            return False
    return True

tempOut = open('out.txt', 'w')

def Get_Url(url):
    time.sleep(0.5)
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    hrefs = soup.select(hrefSel)

    for p in hrefs:
        if p.get_text().strip().find('精准') == -1:  # skip the "精准" promoted listings
            # The extracted text contains a lot of whitespace; strip it out with replace, e.g.
            # "title": title_text.replace("\n", "").replace("\t", "").replace(" ", "")
            number = p.get_text().strip().replace('\n', ' ').replace('\t', ' ').replace(' ', ' ').split(' ')[0]
            if len(number) == 11:
                url_sheet.insert_one(dict(number=number, url=p.get('href')))

TestInfoUrl = 'http://gz.58.com/shoujihao/26973010446666x.shtml?psid=192625604192771088616429809&entinfo=26973010446666_0'

PhoneSel = '#main > div.col.detailPrimary.mb15 > div.col_sub.mainTitle > h1'
ConnectSel = '#t_phone'
PutinTimeSel = '#main > div.col.detailPrimary.mb15 > div.col_sub.mainTitle > div > ul.mtit_con_left.fl > li.time'

def test404():
    # Offline test: on a removed listing, the last <script> block ends with a 404 marker
    f = open('hehe.html', 'r')
    soup = BeautifulSoup(f, 'lxml')
    script_text = soup.find_all('script')[-1].get_text()
    if script_text[-6:-3] == '404':
        print('404')

def Get_Info(url=TestInfoUrl):
    time.sleep(1)
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    script_text = soup.find_all('script')[-1].get_text()
    if script_text[-6:-3] == '404':
        # The listing no longer exists, so skip it
        print('404')
    else:
        Phone = soup.select(PhoneSel)[0].get_text().replace('\n', ' ').strip()[:11]
        cto = soup.select(ConnectSel)[0].get_text().replace('\n', ' ').strip()
        timer = soup.select(PutinTimeSel)[0].get_text().replace('\n', ' ').strip()
        # print(Phone, cto, timer, sep='#')
        info_sheet.insert_one(dict(number=Phone, ConnectNumber=cto, PostTime=timer, Url=url))

def main():
    for pU in Base_Url:
        Get_Url(pU)
    for url in url_sheet.find():
        Get_Info(url['url'])
    # Get_Info()
    # test404()

if __name__ == '__main__':
    main()
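
To check that the run actually produced the roughly 300 Guangzhou numbers mentioned at the top, the two collections can be inspected directly. A quick sketch, assuming pymongo 3.7+ for count_documents and the collection and field names defined in the code above:

from pymongo import MongoClient

db = MongoClient('localhost', 27017)['db']
print(db['url_sheet'].count_documents({}))   # listing URLs collected by Get_Url
print(db['info_sheet'].count_documents({}))  # detail records saved by Get_Info

# Peek at a few of the saved records
for doc in db['info_sheet'].find().limit(3):
    print(doc['number'], doc['PostTime'], doc['Url'])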

Notes on the course code: the job is split into two spiders that crawl separately.

# The name on the far left is the Python object; the name after it is the collection in the database
# spider 1
def get_links_from(channel, pages, who_sells=0):
    # If 'td.t' is missing from the page, we have run past the last page: stop
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    ...
    if soup.find('td', 't'):
        ...
        # return urls
    else:
        # It's the last page!
        pass

# spider 2
def get_item_info(url):
    # 404 finder
    no_longer_exist = '404' in soup.find('script', type="text/javascript").get('src').split('/')
    if no_longer_exist:
        pass
    else:
        # info getter
        title = soup.title.text
        price = soup.select('span.price.c_f50')[0].text
        date = soup.select('.time')[0].text
        area = list(soup.select('.c_25d a')[0].stripped_strings) if soup.find_all('span', 'c_25d') else None
        item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})
        print({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})
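
For an actual large-scale run, these two spiders still need a driver. Here is a hedged sketch of how they might be wired together; channel_list, the page range, and the Pool-based driver are my assumptions about the surrounding course code rather than part of the notes above, and url_list stands for the MongoDB collection that get_links_from fills.

from multiprocessing import Pool

def get_all_links_from(channel):
    # Walk one channel's listing pages; get_links_from itself stops adding URLs past the last page
    for page in range(1, 101):   # assumed upper bound on pages per channel
        get_links_from(channel, page)

if __name__ == '__main__':
    pool = Pool()
    # spider 1: fill the URL collection, one worker per channel
    pool.map(get_all_links_from, channel_list)
    # spider 2: scrape every collected URL into item_info
    pool.map(get_item_info, [doc['url'] for doc in url_list.find()])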

Summary

This lesson extends the earlier ones with error-page handling.

Crawling the listing URLs has also become harder, with many more conditional checks involved.

Running the two spiders together really shows off what the database is good for.

Treat it as review and practice, haha. Level up~~~

It is Qixi (Chinese Valentine's Day) today; best wishes to all the couples out there.
