1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
| from bs4 import BeautifulSoup from pymongo import MongoClient import requests
# MongoDB connection on the local default port, plus handles to the
# database 'db' and its two collections 'bx' and 'gbx'.
client = MongoClient(host='localhost', port=27017)
db = client['db']
bx = db['bx']
gbx = db['gbx']
# CSS selectors used by GetUrl. All three start from the same table cell,
# so the shared prefix is spelled out once.
_INFO_CELL = '#infolist > div.infocon > table > tbody > tr > td.t'
ttS = _INFO_CELL + ' > a'                       # title link (also carries href)
ppS = _INFO_CELL + ' > span.pricebiao > span'   # price text
ddS = _INFO_CELL + ' > span.desc'               # description text

# Request headers sent by GetUrl.
# NOTE(review): the cookie looks like a captured browser session and is
# presumably stale by now -- confirm whether the site still accepts it.
header = {
    'Cookie': (
        'f=n; f=n; id58=c5/njVeF2seOw3jQEnUqAg==; als=0; '
        'bj58_id58s="c2tBeEE1UGpmLWxyMDA1Nw=="; '
        'bdshare_firstime=1468390750533; '
        '__utma=253535702.1519012623.1468390701.1468390701.1468390701.1; '
        '__utmz=253535702.1468390701.1.1.utmcsr=cs.58.com|utmccn=(referral)|utmcmd=referral|utmcct=/; '
        'Hm_lvt_3bb04d7a4ca3846dcc66a99c3e861511=1468390768; bangbigtip2=1; '
        'Hm_lvt_ef9ab733a6772ffc77e498ec3aee46bd=1470723993; '
        'final_history=26973010446666%2C26972994928313%2C26972613537589%2C26972732029743%2C26479081252529; '
        'city=gz; 58home=gz; ipcity=cs%7C%u957F%u6C99%7C0; myfeet_tooltip=end; '
        'sessionid=575563fa-ecbf-4cbd-bb83-a2a7a30be6e4; f=n; '
        'bj58_new_session=0; bj58_init_refer="http://gz.58.com/"; bj58_new_uv=5; '
        '58tj_uuid=5fa8a0ce-253e-4b83-ad71-7f91586401b4; '
        'new_session=0; new_uv=8; utm_source=; spm=; '
        'init_refer=https%253A%252F%252Fwww.google.com.hk%252F'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/52.0.2743.116 Safari/537.36'
    ),
}
# CSS selectors used by gGetUrl. Every selector descends from the same
# <dl> element, so the shared prefix is spelled out once.
_LAYOUT_DL = '#wrapper > div.leftBox > div.layoutlist > dl'
gttS = _LAYOUT_DL + ' > dd.feature > div > ul > li > a'   # title link (also carries href)
gddS = _LAYOUT_DL + ' > dd.feature > p'                   # description text
gppS = _LAYOUT_DL + ' > dd.pt-other > p > span'           # price text
# Request headers sent by gGetUrl.
# NOTE(review): the cookie looks like a captured browser session and is
# presumably stale by now -- confirm whether the site still accepts it.
headers = {
    'Cookie': (
        'ganji_uuid=3031153094415879005517; '
        'ganji_xuuid=3d4ba517-3b7d-4f20-e0d7-05de7a77b9c9.1459734446990; '
        't3=2; citydomain=gz; statistics_clientid=me; '
        'GANJISESSID=d0e973e2c407e720c03eff165ca9fe49; '
        '__utmt=1; STA_DS=1; ganji_login_act=1471756690422; lg=1; '
        '__utma=32156897.1667757214.1468391541.1471009400.1471755417.6; '
        '__utmb=32156897.6.10.1471755417; __utmc=32156897; '
        '__utmz=32156897.1471755417.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; '
        '_gl_tracker=%7B%22ca_source%22%3A%22www.baidu.com%22%2C%22ca_name%22%3A%22-%22'
        '%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22'
        '%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A46616764562%7D; '
        '_gl_speed=%5B%22%2Fbingxiang%2Fo30%2F%22%2C1471756700728%5D'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/52.0.2743.116 Safari/537.36'
    ),
}
def GetUrl(url):
    """Download the listing page at *url* and store its listings in ``bx``.

    The page is fetched with the module-level ``header`` dict and a
    2-second timeout, decoded as UTF-8 and parsed with the lxml parser.
    Titles, prices and descriptions are selected independently (``ttS``,
    ``ppS``, ``ddS``) and paired by position; ``zip`` stops at the
    shortest of the three result lists.

    Errors raised by ``requests.get`` (including timeouts) are not caught.
    """
    response = requests.get(url, headers=header, timeout=2)
    response.encoding = 'utf8'
    page = BeautifulSoup(response.text, 'lxml')

    title_links = page.select(ttS)
    price_tags = page.select(ppS)
    desc_tags = page.select(ddS)

    for link, price_tag, desc_tag in zip(title_links, price_tags, desc_tags):
        record = {
            'title': link.get_text(),
            'price': price_tag.get_text(),
            'descr': desc_tag.get_text(),
            # The title anchor doubles as the link to the listing page.
            'href': link.get('href'),
        }
        bx.insert_one(record)

    print('success get ', url)
# Listing pages pn1 .. pn69 (range end is exclusive).
urls = list(map('http://gz.58.com/bingxiang/pn{}'.format, range(1, 70)))
def gGetUrl(url):
    """Download the listing page at *url* and store its listings in ``gbx``.

    The page is fetched with the module-level ``headers`` dict and a
    2-second timeout, decoded as UTF-8 and parsed with the lxml parser.
    Titles, descriptions and prices are selected independently (``gttS``,
    ``gddS``, ``gppS``) and paired by position; ``zip`` stops at the
    shortest of the three result lists. Text fields are stripped of
    surrounding whitespace before being stored.

    Listings whose price text is exactly '超便宜' are skipped.

    Errors raised by ``requests.get`` (including timeouts) are not caught.
    """
    wb_data = requests.get(url, timeout=2, headers=headers)
    wb_data.encoding = 'utf8'
    soup = BeautifulSoup(wb_data.text, 'lxml')

    titles = soup.select(gttS)
    descrs = soup.select(gddS)
    prices = soup.select(gppS)

    for title_tag, price_tag, descr_tag in zip(titles, prices, descrs):
        price = price_tag.get_text().strip()
        if price == '超便宜':
            # Placeholder text instead of an actual price; not stored.
            continue
        # Write to the dedicated ``gbx`` collection. The previous code
        # inserted into ``bx``, mixing these records with GetUrl's and
        # leaving ``gbx`` unused.
        gbx.insert_one({
            'title': title_tag.get_text().strip(),
            'price': price,
            'descr': descr_tag.get_text().strip(),
            'href': title_tag.get('href'),
        })

    print('success get ', url)


# Listing pages o2 .. o31 (range end is exclusive).
gurls = ['http://gz.ganji.com/bingxiang/o{}/'.format(i) for i in range(2, 32)]
# Script entry: scrape every page in gurls sequentially, one request at a
# time. An exception raised by gGetUrl (e.g. a request timeout) stops the run.
for dt in gurls: gGetUrl(dt)
|