Python实战计划: Beginning MongoDB

Scraping Xiaozhu rentals again, this time with MongoDB

MongoDB

Environment Setup (MongoDB on Mac)

sudo /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"

brew update # this step may fail...

sudo chown -R $(whoami) /usr/local # this step is crucial..

brew install mongodb

cd /
sudo mkdir -p /data/db
sudo chown -R radar_sir /data/db # radar_sir is my username; substitute your own
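
To confirm that mongod is actually reachable before moving on, a quick connectivity check from Python works well (a minimal sketch, assuming pymongo is installed, e.g. via pip install pymongo):

from pymongo import MongoClient

# Connect to the local mongod (default port 27017).
client = MongoClient('localhost', 27017)

# server_info() forces a round trip to the server and raises
# pymongo.errors.ServerSelectionTimeoutError if mongod is not running.
print(client.server_info()['version'])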

PyCharm Plugin Test Code

(refresh the plugin view afterwards)

from pymongo import MongoClient

host = 'localhost'
port = 27017

client = MongoClient(host, port)
db = client['test']
sheet = db['sheet']
for i in range(1001):
    print(i)
    sheet.insert_one({
        'name': 'name' + str(i),
        'age': i
    })
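
A quick way to verify the inserts without leaving Python (a small sketch; count_documents requires pymongo >= 3.7):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
sheet = client['test']['sheet']

# Should report 1001 documents after the loop above has run.
print(sheet.count_documents({}))

# Spot-check a single record.
print(sheet.find_one({'name': 'name0'}))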

Basic Usage

# start
brew services list
brew services start mongodb
mongod
### the three commands above didn't actually work for me --,
# instead, open mongo from the command line inside the downloaded mongodb/bin directory
# the commands below are also run from inside that directory

# query
mongo
show dbs
db

# switch database
use ilw
# read data
db.sheet_out.find()
# pretty-printed:
# db.col.find().pretty()
# AND
# db.col.find({key1:value1, key2:value2}).pretty()
# OR
# db.col.find({$or:[{"by":"菜鸟教程"},{"title": "MongoDB 教程"}]}).pretty()
# >=
db.col.find({likes : {$gte : 100}})
# by field type
db.col.find({"title" : {$type : 2}}) # 2 -> String, 1 -> Double

# correspondence in pymongo
# client = MongoClient('localhost', 27017)
# ilw = client['ilw']
# sheet_out = ilw['sheet_out']

# exporting data from the command line
# json
mongoexport -d local(database name) -c startup_log(collection name) -o startup_log.json(output file name)
# csv
mongoexport -d local(database name) -c startup_log(collection name) --csv -f(field names) hostname,startTime -o startup_log.csv(output file name)

# examples
mongoexport -d ilw -c sheet_out --csv -f title,place,price,comment -o sheet_out.csv
mongoexport -d dataBase -c SecondUrl --csv -f title,href -o href.csv
mongoexport -d dataBase -c Info --csv -f title,type,time,price,new,place -o information.csv
mongoexport -d dataBase -c analy --csv -f type,month,InforNum -o analy.csv

2016-08-08T14:29:35.978+0800 csv flag is deprecated; please use --type=csv instead
2016-08-08T14:29:35.986+0800 connected to: localhost
2016-08-08T14:29:36.010+0800 exported 528 records
# the exported files land in the directory mongoexport is run from; results shown below

# importing data
db.createCollection('log')
show collections

mongoimport -d local(database name) -c log(collection name) --file startup_log.json(input file name)
mongoimport -d local(database name) -c log(collection name) --type csv --headerline --file startup_log.csv(input file name)
# without --headerline: validating settings: must specify --fields, --fieldFile or --headerline to import this file type
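
The shell queries above map directly onto pymongo. As a rough sketch of the same AND / OR / range filters (test.col is a hypothetical collection holding the tutorial documents):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
col = client['test']['col']

# AND: list both conditions in one filter document
for doc in col.find({'by': '菜鸟教程', 'title': 'MongoDB 教程'}):
    print(doc)

# OR: $or takes a list of alternative filters
for doc in col.find({'$or': [{'by': '菜鸟教程'}, {'title': 'MongoDB 教程'}]}):
    print(doc)

# >= : comparison operators nest inside the field's filter
for doc in col.find({'likes': {'$gte': 100}}):
    print(doc)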

Other Usage

Connection


# To connect to a MongoDB server with a username and password, you must use the
# 'username:password@hostname/dbname' format, where 'username' is the user name and 'password' is the password.
# mongodb://admin:123456@localhost/
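
In pymongo the same URI string can be passed straight to MongoClient (a sketch; admin/123456 are the placeholder credentials from the comment above):

from pymongo import MongoClient

# The URI carries the credentials; replace admin/123456 with a real user.
client = MongoClient('mongodb://admin:123456@localhost/')
print(client.list_database_names())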

Creating and Deleting Databases

# create a database (the result is also visible from the PyCharm plugin)
# use DATABASE_NAME
# delete
# switch to the database first, then: db.dropDatabase()
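
The pymongo counterpart needs no use command: databases and collections come into existence on the first write, and dropping is a method on the client. A minimal sketch (scratch_db is a throwaway name for illustration):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)

# Created lazily: the database only materializes on the first insert.
db = client['scratch_db']
db['scratch_col'].insert_one({'hello': 'world'})
print(client.list_database_names())

# Dropping is a client method rather than a shell command.
client.drop_database('scratch_db')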

Inserting, Updating, and Deleting Data


# insert
db.COLLECTION_NAME.insert(document)
# db.col.insert({title: 'MongoDB 教程',
#     description: 'MongoDB 是一个 Nosql 数据库',
#     by: '菜鸟教程',
#     url: 'http://www.runoob.com',
#     tags: ['mongodb', 'database', 'NoSQL'],
#     likes: 100
# })

# update
db.collection.update(
    <query>,  # the query criteria, like the WHERE clause of a SQL UPDATE
    <update>, # the update document plus update operators ($set, $inc, ...)
    {
        upsert: <boolean>,       # if nothing matches the query, insert the update document when true
        multi: <boolean>,        # when true, update every record the query matches
        writeConcern: <document> # the level at which write errors are raised
    }
)

# example
db.col.update({'title':'MongoDB 教程'},{$set:{'title':'MongoDB'}})
WriteResult({ "nMatched" : 1, "nUpserted" : 0, "nModified" : 1 }) # output

# save: replaces an existing document wholesale
db.collection.save(
    <document>,
    {
        writeConcern: <document>
    }
)

# remove
db.collection.remove(
    <query>, # criteria selecting the documents to delete
    {
        justOne: <boolean>,
        writeConcern: <document>
    }
)
# db.col.remove({'title':'MongoDB 教程'})
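
pymongo exposes these as explicit methods (the catch-all insert/update/remove are deprecated there in favor of the *_one/*_many variants). A sketch of the same operations, reusing the hypothetical test.col collection:

from pymongo import MongoClient

col = MongoClient('localhost', 27017)['test']['col']

# insert
col.insert_one({'title': 'MongoDB 教程', 'likes': 100})

# update: a filter plus update operators; upsert=True inserts when nothing matches
result = col.update_one({'title': 'MongoDB 教程'},
                        {'$set': {'title': 'MongoDB'}},
                        upsert=False)
print(result.matched_count, result.modified_count)

# update_many is the counterpart of multi: true
col.update_many({'likes': 100}, {'$inc': {'likes': 1}})

# remove
col.delete_one({'title': 'MongoDB'})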

Limit, Skip, Sort


db.COLLECTION_NAME.find().limit(NUMBER) # number of records to read
db.COLLECTION_NAME.find().skip(NUMBER)  # number of records to skip
db.COLLECTION_NAME.find().sort({KEY:1}) # 1 for ascending, -1 for descending

To use a variable inside a regular expression, you must run the concatenated string through eval; passing the concatenated string to the expression directly fails silently -- no error message, just an empty result. For example:
var name = eval("/" + key + "/i");
The following fuzzy query matches titles containing the keyword, case-insensitively:
title: eval("/" + title + "/i") // equivalent to title: {$regex: title, $options: "i"}

Regex

db.posts.find({post_text:{$regex:"w3cschool.cc"}})
# ignore case
db.posts.find({post_text:{$regex:"w3cschool.cc",$options:"i"}})
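
In pymongo the last two sections look like the sketch below: sort takes a list of (key, direction) pairs, and a variable inside a regex is just an ordinary Python string, so none of the eval gymnastics above are needed (test.col is again a hypothetical collection):

import re
from pymongo import MongoClient

col = MongoClient('localhost', 27017)['test']['col']

# limit / skip / sort; 1 is ascending, -1 descending
for doc in col.find().sort([('likes', -1)]).skip(10).limit(5):
    print(doc)

# A variable in a regex is a plain string -- no eval required.
keyword = 'w3cschool'
for doc in col.find({'post_text': {'$regex': keyword, '$options': 'i'}}):
    print(doc)

# Equivalently, pass a compiled Python pattern.
for doc in col.find({'post_text': re.compile(keyword, re.IGNORECASE)}):
    print(doc)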

Exercise

Course Code && Notes

#-*- coding: utf8 -*-
import pymongo
# connect to mongodb

client = pymongo.MongoClient('localhost',27017)
# activate the local client
# localhost : the local machine
# 27017 : the port -- don't mistype it as 27817..

waxtu = client['waxtu']
# create/select the database

sheet_tab = waxtu['sheet_tab']
# create/select the collection

path = 'walden.txt'

with open(path,'r') as f:
    lines = f.readlines()
    for index, line in enumerate(lines):
        data = dict(index=index,line=line,words=len(line.split()))
        #print(data) # check what is being read..
        sheet_tab.insert_one(data)
        # store the data

for item in sheet_tab.find():
    print(item)
#{'_id': ObjectId('57a80116d6616918def0fbfa'), 'index': 9703, 'words': 1, 'line': '.\n'}
# check that the data was stored
# alternatively: refresh the database plugin

for item in sheet_tab.find({'words':0}):
    print(item['line'])

# $lt / $lte / $gt / $gte / $ne are equivalent to < / <= / > / >= / != respectively
# (l = less, g = greater, e = equal, n = not)
for item in sheet_tab.find({'words':{'$gt':5}}):
    print(item)

Task

The scraping part is basically the same as in the earlier chapter.

Code

Scraping the Listings

#-*- coding: utf8 -*-
from bs4 import BeautifulSoup
import requests
import time

PriceSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > span.result_price > i'
TitleSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > a > span'
CommentSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em > span'
UrlSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname'
PlaceSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em'

Testurl = 'http://cs.xiaozhu.com' # attention: remember the http:// prefix
headers = {
    'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
    'Cookie' : 'abtest_ABTest4SearchDate=b; _gat_UA-33763849-7=1; __utmt=1; xzuuid=b2e6cf19; OZ_1U_2282=vid=v7a566a4734c93.0&ctime=1470629052&ltime=1470629048; OZ_1Y_2282=erefer=https%3A//www.baidu.com/link%3Furl%3DqGXg8_L7_OKuYp3jvCSU2LCcrIH5r8_CN22-4NrCKWWUCQRu4q4wOdeB4q2b_Zg9%26wd%3D%26eqid%3Db72a32fa000083cd0000000557a80293&eurl=http%3A//www.xiaozhu.com/&etime=1470628619&ctime=1470629052&ltime=1470629048&compid=2282; _ga=GA1.2.206747787.1470457508; __utma=29082403.206747787.1470457508.1470469908.1470628620.3; __utmb=29082403.4.10.1470628620; __utmc=29082403; __utmz=29082403.1470628620.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
}

def GetInfo(url = Testurl):
    time.sleep(1)
    wb_data = requests.get(url,headers=headers)
    soup = BeautifulSoup(wb_data.text,'lxml')

    prices = soup.select(PriceSel)
    titles = soup.select(TitleSel)
    comments = soup.select(CommentSel)
    urls = soup.select(UrlSel)
    places = soup.select(PlaceSel)

    for price,title,comment,url,place in zip(prices,titles,comments,urls,places):
        data = dict(price=price.get_text(),title=title.get_text(),
                    place=place.get_text().split('\n')[-1].strip(),
                    comment=comment.get_text().split('\n')[1].strip(),
                    url=url.get('detailurl'))

Storage / Complete Code

#-*- coding: utf8 -*-
from bs4 import BeautifulSoup
import requests
import time
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
ilw = client['ilw']
sheet_out = ilw['sheet_out']

url = ['http://cs.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1,4)]

Testurl = 'http://cs.xiaozhu.com' # attention: remember the http:// prefix
headers = {
    'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
    'Cookie' : 'abtest_ABTest4SearchDate=b; _gat_UA-33763849-7=1; __utmt=1; xzuuid=b2e6cf19; OZ_1U_2282=vid=v7a566a4734c93.0&ctime=1470629052&ltime=1470629048; OZ_1Y_2282=erefer=https%3A//www.baidu.com/link%3Furl%3DqGXg8_L7_OKuYp3jvCSU2LCcrIH5r8_CN22-4NrCKWWUCQRu4q4wOdeB4q2b_Zg9%26wd%3D%26eqid%3Db72a32fa000083cd0000000557a80293&eurl=http%3A//www.xiaozhu.com/&etime=1470628619&ctime=1470629052&ltime=1470629048&compid=2282; _ga=GA1.2.206747787.1470457508; __utma=29082403.206747787.1470457508.1470469908.1470628620.3; __utmb=29082403.4.10.1470628620; __utmc=29082403; __utmz=29082403.1470628620.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
}

PriceSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > span.result_price > i'
TitleSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > a > span'
CommentSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em > span'
UrlSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname'
PlaceSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em'

def GetInfo(url = Testurl):
    time.sleep(1)
    wb_data = requests.get(url,headers=headers)
    soup = BeautifulSoup(wb_data.text,'lxml')

    prices = soup.select(PriceSel)
    titles = soup.select(TitleSel)
    comments = soup.select(CommentSel)
    urls = soup.select(UrlSel)
    places = soup.select(PlaceSel)

    for price,title,comment,url,place in zip(prices,titles,comments,urls,places):
        data = dict(price=int(price.get_text()), # store the price as a number so the $gte query below can match
                    title=title.get_text(),
                    place=place.get_text().split('\n')[-1].strip(),
                    comment=comment.get_text().split('\n')[1].strip(),
                    url=url.get('detailurl'))
        sheet_out.insert_one(data)

def main():
    for URL in url:
        GetInfo(URL)
    for items in sheet_out.find({'price':{'$gte': 500}}):
        print(items)

if __name__ == '__main__':
    main()

Results

Summary

Storing into the database handles the data much better..

But after learning command-line import and export, it's a bit of a pity that none of it carries over into PyCharm..

Do I still have to call the csv library and do it the way the earlier chapters did? QAQ
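
One way to stay inside Python is to pair pymongo with the standard csv module (a sketch, assuming the ilw.sheet_out data from above is present; it mirrors the mongoexport example rather than being the course's method):

import csv
from pymongo import MongoClient

sheet_out = MongoClient('localhost', 27017)['ilw']['sheet_out']

fields = ['title', 'place', 'price', 'comment']

# Roughly equivalent to:
#   mongoexport -d ilw -c sheet_out --type=csv -f title,place,price,comment -o sheet_out.csv
with open('sheet_out.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fields, extrasaction='ignore')
    writer.writeheader()
    for doc in sheet_out.find({}, {'_id': 0}): # project away _id so rows match the header
        writer.writerow(doc)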

(Solved √) Also, there is no data on the Mac to simulate the Windows import/export commands with… so sometimes the corresponding results can't be reproduced.
