Scraping Images

I really like these bookmarks… donated two yuan to the author :)

#-*- coding: utf8 -*-
from bs4 import BeautifulSoup
from pymongo import MongoClient
import requests
import urllib.request

# MongoDB connection (not actually used in this script)
client = MongoClient('localhost', 27017)
db = client['db']
sq = db['sq']

# CSS selector for the image tags in the post
ttS = '#post-288 > div > p > a > img'
wb_data = requests.get('http://morning.rocks/')
soup = BeautifulSoup(wb_data.text, 'lxml')
imgs = soup.select(ttS)

# download every image into the dt/ folder, numbered 0.jpg, 1.jpg, ...
x = 0
for i in imgs:
    urllib.request.urlretrieve(i.get('src'), 'dt/%s.jpg' % x)
    x = x + 1
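
One thing to watch: urllib.request.urlretrieve will not create the dt/ output folder by itself, so the downloads fail if it does not exist yet. A minimal sketch (assuming you keep the same dt/ path) to create it before running the loop:

import os
os.makedirs('dt', exist_ok=True)  # create the output folder if it is missing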

Please don't use these for commercial purposes~

I also wrote a scraper for second-hand refrigerator listings today.

#-*- coding: utf8 -*-
from bs4 import BeautifulSoup
from pymongo import MongoClient
import requests

client = MongoClient('localhost',27017)
db = client['db']
bx = db['bx']    # both scrapers below insert into this collection
gbx = db['gbx']  # defined but never written to

# CSS selectors for the 58.com listing title, price and description
ttS = '#infolist > div.infocon > table > tbody > tr > td.t > a'
ppS = '#infolist > div.infocon > table > tbody > tr > td.t > span.pricebiao > span'
ddS = '#infolist > div.infocon > table > tbody > tr > td.t > span.desc'
header = {
'Cookie':'f=n; f=n; id58=c5/njVeF2seOw3jQEnUqAg==; als=0; bj58_id58s="c2tBeEE1UGpmLWxyMDA1Nw=="; bdshare_firstime=1468390750533; __utma=253535702.1519012623.1468390701.1468390701.1468390701.1; __utmz=253535702.1468390701.1.1.utmcsr=cs.58.com|utmccn=(referral)|utmcmd=referral|utmcct=/; Hm_lvt_3bb04d7a4ca3846dcc66a99c3e861511=1468390768; bangbigtip2=1; Hm_lvt_ef9ab733a6772ffc77e498ec3aee46bd=1470723993; final_history=26973010446666%2C26972994928313%2C26972613537589%2C26972732029743%2C26479081252529; city=gz; 58home=gz; ipcity=cs%7C%u957F%u6C99%7C0; myfeet_tooltip=end; sessionid=575563fa-ecbf-4cbd-bb83-a2a7a30be6e4; f=n; bj58_new_session=0; bj58_init_refer="http://gz.58.com/"; bj58_new_uv=5; 58tj_uuid=5fa8a0ce-253e-4b83-ad71-7f91586401b4; new_session=0; new_uv=8; utm_source=; spm=; init_refer=https%253A%252F%252Fwww.google.com.hk%252F',
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
}


# CSS selectors for the Ganji listing title, description and price
gttS = '#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'
gddS = '#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > p'
gppS = '#wrapper > div.leftBox > div.layoutlist > dl > dd.pt-other > p > span'


headers = {
'Cookie':'ganji_uuid=3031153094415879005517; ganji_xuuid=3d4ba517-3b7d-4f20-e0d7-05de7a77b9c9.1459734446990; t3=2; citydomain=gz; statistics_clientid=me; GANJISESSID=d0e973e2c407e720c03eff165ca9fe49; __utmt=1; STA_DS=1; ganji_login_act=1471756690422; lg=1; __utma=32156897.1667757214.1468391541.1471009400.1471755417.6; __utmb=32156897.6.10.1471755417; __utmc=32156897; __utmz=32156897.1471755417.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _gl_tracker=%7B%22ca_source%22%3A%22www.baidu.com%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22seo_baidu%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A46616764562%7D; _gl_speed=%5B%22%2Fbingxiang%2Fo30%2F%22%2C1471756700728%5D',
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
}

def GetUrl(url):
    # scrape one 58.com list page and store each listing in the bx collection
    wb_data = requests.get(url, timeout=2, headers=header)
    wb_data.encoding = 'utf8'
    # if wb_data.status_code != 200:
    #     print('Fail to Get ', url)
    #     return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select(ttS)
    price = soup.select(ppS)
    descr = soup.select(ddS)
    for a, b, c in zip(title, price, descr):
        bx.insert_one(dict(title=a.get_text(),
                           price=b.get_text(),
                           descr=c.get_text(),
                           href=a.get('href')))
    print('success get ', url)

urls = ['http://gz.58.com/bingxiang/pn{}'.format(i) for i in range(1, 70)]
# for u in urls:
#     GetUrl(u)




def gGetUrl(url):
    # scrape one Ganji list page; skip the promoted '超便宜' entries
    wb_data = requests.get(url, timeout=2, headers=headers)
    wb_data.encoding = 'utf8'
    # if wb_data.status_code != 200:
    #     print('Fail to Get ', url)
    #     return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select(gttS)
    descr = soup.select(gddS)
    price = soup.select(gppS)
    for a, b, c in zip(title, price, descr):
        if b.get_text().strip() != '超便宜':
            bx.insert_one(dict(title=a.get_text().strip(),
                               price=b.get_text().strip(),
                               descr=c.get_text().strip(),
                               href=a.get('href')))
    print('success get ', url)
gurls = ['http://gz.ganji.com/bingxiang/o{}/'.format(i) for i in range(2,32)]

for dt in gurls:
    gGetUrl(dt)
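
As written, only the Ganji pages are actually crawled; the loop over the 58.com urls list is left commented out. A quick sketch (assuming the same bx collection and a reasonably recent pymongo) to run that crawl as well and check how many listings have been stored:

for u in urls:
    GetUrl(u)  # crawl the 58.com list pages too

print(bx.count_documents({}))  # total number of listings saved to MongoDB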

Filtering

#-*- coding: utf8 -*-
from pymongo import MongoClient
import csv

client = MongoClient('localhost', 27017)
db = client['db']
bx = db['bx']
gbx = db['gbx']

# column order for the CSV export
info = ['title', 'price', 'descr', 'href', '_id']

testFile = open('test1.csv', 'w')
writer = csv.writer(testFile)
writer.writerow(info)                    # header row
writer = csv.DictWriter(testFile, info)  # switch to a DictWriter for the records themselves
for item in bx.find():
    # keep only listings priced under 300 yuan; skip non-numeric prices such as '面议'
    if item['price'].strip().isdigit() and int(item['price']) < 300:
        writer.writerow(item)

testFile.close()
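
If you also want the cheap listings ordered by price, one option (a sketch reusing the same bx collection and info columns, writing to a separate, hypothetical test2.csv) is to collect the matches first and sort before writing:

cheap = [item for item in bx.find()
         if item['price'].strip().isdigit() and int(item['price']) < 300]
cheap.sort(key=lambda item: int(item['price']))  # cheapest fridges first

with open('test2.csv', 'w') as f:
    w = csv.DictWriter(f, info)
    w.writeheader()     # header row
    w.writerows(cheap)  # one row per cheap listing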