Python实战计划: Beginning MongoDB

Scraping Xiaozhu rentals again, this time with MongoDB

MongoDB

Environment Setup (MongoDB on Mac)

sudo /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"

brew update # this step may fail...

sudo chown -R $(whoami) /usr/local # this step is crucial..

brew install mongodb

cd /
sudo mkdir -p /data/db
sudo chown -R radar_sir /data/db # radar_sir is my username; substitute your own
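
To confirm that mongod is actually reachable before moving on, a quick connectivity check from Python works well (a minimal sketch, assuming pymongo is installed, e.g. via pip install pymongo):

from pymongo import MongoClient

# Connect to the local mongod (default port 27017).
client = MongoClient('localhost', 27017)

# server_info() forces a round trip to the server and raises
# pymongo.errors.ServerSelectionTimeoutError if mongod is not running.
print(client.server_info()['version'])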

PyCharm Plugin Test Code

(refresh the plugin view afterwards)

from pymongo import MongoClient

host = 'localhost'
port = 27017

client = MongoClient(host, port)
db = client['test']
sheet = db['sheet']
for i in range(1001):
    print(i)
    sheet.insert_one({
        'name': 'name' + str(i),
        'age': i
    })
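
A quick way to verify the inserts without leaving Python (a small sketch; count_documents requires pymongo >= 3.7):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
sheet = client['test']['sheet']

# Should report 1001 documents after the loop above has run.
print(sheet.count_documents({}))

# Spot-check a single record.
print(sheet.find_one({'name': 'name0'}))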

Basic Usage

# start
brew services list
brew services start mongodb
mongod
### the three commands above didn't actually work for me --,
# instead, open mongo from the command line inside the downloaded mongodb/bin directory
# the commands below are also run from inside that directory

# query
mongo
show dbs
db

# switch database
use ilw
# read data
db.sheet_out.find()
# pretty-printed:
# db.col.find().pretty()
# AND
# db.col.find({key1:value1, key2:value2}).pretty()
# OR
# db.col.find({$or:[{"by":"菜鸟教程"},{"title": "MongoDB 教程"}]}).pretty()
# >=
db.col.find({likes : {$gte : 100}})
# by field type
db.col.find({"title" : {$type : 2}}) # 2 -> String, 1 -> Double

# correspondence in pymongo
# client = MongoClient('localhost', 27017)
# ilw = client['ilw']
# sheet_out = ilw['sheet_out']

# exporting data from the command line
# json
mongoexport -d local(database name) -c startup_log(collection name) -o startup_log.json(output file name)
# csv
mongoexport -d local(database name) -c startup_log(collection name) --csv -f(field names) hostname,startTime -o startup_log.csv(output file name)

# examples
mongoexport -d ilw -c sheet_out --csv -f title,place,price,comment -o sheet_out.csv
mongoexport -d dataBase -c SecondUrl --csv -f title,href -o href.csv
mongoexport -d dataBase -c Info --csv -f title,type,time,price,new,place -o information.csv
mongoexport -d dataBase -c analy --csv -f type,month,InforNum -o analy.csv

2016-08-08T14:29:35.978+0800 csv flag is deprecated; please use --type=csv instead
2016-08-08T14:29:35.986+0800 connected to: localhost
2016-08-08T14:29:36.010+0800 exported 528 records
# the exported files land in the directory mongoexport is run from; results shown below

# importing data
db.createCollection('log')
show collections

mongoimport -d local(database name) -c log(collection name) --file startup_log.json(input file name)
mongoimport -d local(database name) -c log(collection name) --type csv --headerline --file startup_log.csv(input file name)
# without --headerline: validating settings: must specify --fields, --fieldFile or --headerline to import this file type
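
The shell queries above map directly onto pymongo. As a rough sketch of the same AND / OR / range filters (test.col is a hypothetical collection holding the tutorial documents):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
col = client['test']['col']

# AND: list both conditions in one filter document
for doc in col.find({'by': '菜鸟教程', 'title': 'MongoDB 教程'}):
    print(doc)

# OR: $or takes a list of alternative filters
for doc in col.find({'$or': [{'by': '菜鸟教程'}, {'title': 'MongoDB 教程'}]}):
    print(doc)

# >= : comparison operators nest inside the field's filter
for doc in col.find({'likes': {'$gte': 100}}):
    print(doc)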

Other Usage

Connection


# To connect to a MongoDB server with a username and password, you must use the
# 'username:password@hostname/dbname' format, where 'username' is the user name and 'password' is the password.
# mongodb://admin:123456@localhost/
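
In pymongo the same URI string can be passed straight to MongoClient (a sketch; admin/123456 are the placeholder credentials from the comment above):

from pymongo import MongoClient

# The URI carries the credentials; replace admin/123456 with a real user.
client = MongoClient('mongodb://admin:123456@localhost/')
print(client.list_database_names())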

Creating and Deleting Databases

# create a database (the result is also visible from the PyCharm plugin)
# use DATABASE_NAME
# delete
# switch to the database first, then: db.dropDatabase()
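
The pymongo counterpart needs no use command: databases and collections come into existence on the first write, and dropping is a method on the client. A minimal sketch (scratch_db is a throwaway name for illustration):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)

# Created lazily: the database only materializes on the first insert.
db = client['scratch_db']
db['scratch_col'].insert_one({'hello': 'world'})
print(client.list_database_names())

# Dropping is a client method rather than a shell command.
client.drop_database('scratch_db')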

Inserting, Updating, and Deleting Data


# insert
db.COLLECTION_NAME.insert(document)
# db.col.insert({title: 'MongoDB 教程',
#     description: 'MongoDB 是一个 Nosql 数据库',
#     by: '菜鸟教程',
#     url: 'http://www.runoob.com',
#     tags: ['mongodb', 'database', 'NoSQL'],
#     likes: 100
# })

# update
db.collection.update(
    <query>,  # the query criteria, like the WHERE clause of a SQL UPDATE
    <update>, # the update document plus update operators ($set, $inc, ...)
    {
        upsert: <boolean>,       # if nothing matches the query, insert the update document when true
        multi: <boolean>,        # when true, update every record the query matches
        writeConcern: <document> # the level at which write errors are raised
    }
)

# example
db.col.update({'title':'MongoDB 教程'},{$set:{'title':'MongoDB'}})
WriteResult({ "nMatched" : 1, "nUpserted" : 0, "nModified" : 1 }) # output

# save: replaces an existing document wholesale
db.collection.save(
    <document>,
    {
        writeConcern: <document>
    }
)

# remove
db.collection.remove(
    <query>, # criteria selecting the documents to delete
    {
        justOne: <boolean>,
        writeConcern: <document>
    }
)
# db.col.remove({'title':'MongoDB 教程'})
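
pymongo exposes these as explicit methods (the catch-all insert/update/remove are deprecated there in favor of the *_one/*_many variants). A sketch of the same operations, reusing the hypothetical test.col collection:

from pymongo import MongoClient

col = MongoClient('localhost', 27017)['test']['col']

# insert
col.insert_one({'title': 'MongoDB 教程', 'likes': 100})

# update: a filter plus update operators; upsert=True inserts when nothing matches
result = col.update_one({'title': 'MongoDB 教程'},
                        {'$set': {'title': 'MongoDB'}},
                        upsert=False)
print(result.matched_count, result.modified_count)

# update_many is the counterpart of multi: true
col.update_many({'likes': 100}, {'$inc': {'likes': 1}})

# remove
col.delete_one({'title': 'MongoDB'})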

Limit, Skip, Sort


db.COLLECTION_NAME.find().limit(NUMBER) # number of records to read
db.COLLECTION_NAME.find().skip(NUMBER)  # number of records to skip
db.COLLECTION_NAME.find().sort({KEY:1}) # 1 for ascending, -1 for descending

To use a variable inside a regular expression, you must run the concatenated string through eval; passing the concatenated string to the expression directly fails silently -- no error message, just an empty result. For example:
var name = eval("/" + key + "/i");
The following fuzzy query matches titles containing the keyword, case-insensitively:
title: eval("/" + title + "/i") // equivalent to title: {$regex: title, $options: "i"}

Regex

db.posts.find({post_text:{$regex:"w3cschool.cc"}})
# ignore case
db.posts.find({post_text:{$regex:"w3cschool.cc",$options:"i"}})
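
In pymongo the last two sections look like the sketch below: sort takes a list of (key, direction) pairs, and a variable inside a regex is just an ordinary Python string, so none of the eval gymnastics above are needed (test.col is again a hypothetical collection):

import re
from pymongo import MongoClient

col = MongoClient('localhost', 27017)['test']['col']

# limit / skip / sort; 1 is ascending, -1 descending
for doc in col.find().sort([('likes', -1)]).skip(10).limit(5):
    print(doc)

# A variable in a regex is a plain string -- no eval required.
keyword = 'w3cschool'
for doc in col.find({'post_text': {'$regex': keyword, '$options': 'i'}}):
    print(doc)

# Equivalently, pass a compiled Python pattern.
for doc in col.find({'post_text': re.compile(keyword, re.IGNORECASE)}):
    print(doc)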

Exercise

Course Code && Notes

#-*- coding: utf8 -*-
import pymongo
# connect to mongodb

client = pymongo.MongoClient('localhost',27017)
# activate the local client
# localhost : the local machine
# 27017 : the port -- don't mistype it as 27817..

waxtu = client['waxtu']
# create/select the database

sheet_tab = waxtu['sheet_tab']
# create/select the collection

path = 'walden.txt'

with open(path,'r') as f:
    lines = f.readlines()
    for index, line in enumerate(lines):
        data = dict(index=index,line=line,words=len(line.split()))
        #print(data) # check what is being read..
        sheet_tab.insert_one(data)
        # store the data

for item in sheet_tab.find():
    print(item)
#{'_id': ObjectId('57a80116d6616918def0fbfa'), 'index': 9703, 'words': 1, 'line': '.\n'}
# check that the data was stored
# alternatively: refresh the database plugin

for item in sheet_tab.find({'words':0}):
    print(item['line'])

# $lt / $lte / $gt / $gte / $ne are equivalent to < / <= / > / >= / != respectively
# (l = less, g = greater, e = equal, n = not)
for item in sheet_tab.find({'words':{'$gt':5}}):
    print(item)

Task

The scraping part is basically the same as in the earlier chapter.

Code

Scraping the Listings

#-*- coding: utf8 -*-
from bs4 import BeautifulSoup
import requests
import time

PriceSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > span.result_price > i'
TitleSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > a > span'
CommentSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em > span'
UrlSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname'
PlaceSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em'

Testurl = 'http://cs.xiaozhu.com' # attention: remember the http:// prefix
headers = {
    'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
    'Cookie' : 'abtest_ABTest4SearchDate=b; _gat_UA-33763849-7=1; __utmt=1; xzuuid=b2e6cf19; OZ_1U_2282=vid=v7a566a4734c93.0&ctime=1470629052&ltime=1470629048; OZ_1Y_2282=erefer=https%3A//www.baidu.com/link%3Furl%3DqGXg8_L7_OKuYp3jvCSU2LCcrIH5r8_CN22-4NrCKWWUCQRu4q4wOdeB4q2b_Zg9%26wd%3D%26eqid%3Db72a32fa000083cd0000000557a80293&eurl=http%3A//www.xiaozhu.com/&etime=1470628619&ctime=1470629052&ltime=1470629048&compid=2282; _ga=GA1.2.206747787.1470457508; __utma=29082403.206747787.1470457508.1470469908.1470628620.3; __utmb=29082403.4.10.1470628620; __utmc=29082403; __utmz=29082403.1470628620.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
}

def GetInfo(url = Testurl):
    time.sleep(1)
    wb_data = requests.get(url,headers=headers)
    soup = BeautifulSoup(wb_data.text,'lxml')

    prices = soup.select(PriceSel)
    titles = soup.select(TitleSel)
    comments = soup.select(CommentSel)
    urls = soup.select(UrlSel)
    places = soup.select(PlaceSel)

    for price,title,comment,url,place in zip(prices,titles,comments,urls,places):
        data = dict(price=price.get_text(),title=title.get_text(),
                    place=place.get_text().split('\n')[-1].strip(),
                    comment=comment.get_text().split('\n')[1].strip(),
                    url=url.get('detailurl'))

Storage / Complete Code

#-*- coding: utf8 -*-
from bs4 import BeautifulSoup
import requests
import time
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
ilw = client['ilw']
sheet_out = ilw['sheet_out']

url = ['http://cs.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1,4)]

Testurl = 'http://cs.xiaozhu.com' # attention: remember the http:// prefix
headers = {
    'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
    'Cookie' : 'abtest_ABTest4SearchDate=b; _gat_UA-33763849-7=1; __utmt=1; xzuuid=b2e6cf19; OZ_1U_2282=vid=v7a566a4734c93.0&ctime=1470629052&ltime=1470629048; OZ_1Y_2282=erefer=https%3A//www.baidu.com/link%3Furl%3DqGXg8_L7_OKuYp3jvCSU2LCcrIH5r8_CN22-4NrCKWWUCQRu4q4wOdeB4q2b_Zg9%26wd%3D%26eqid%3Db72a32fa000083cd0000000557a80293&eurl=http%3A//www.xiaozhu.com/&etime=1470628619&ctime=1470629052&ltime=1470629048&compid=2282; _ga=GA1.2.206747787.1470457508; __utma=29082403.206747787.1470457508.1470469908.1470628620.3; __utmb=29082403.4.10.1470628620; __utmc=29082403; __utmz=29082403.1470628620.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
}

PriceSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > span.result_price > i'
TitleSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > a > span'
CommentSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em > span'
UrlSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname'
PlaceSel = '#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em'

def GetInfo(url = Testurl):
    time.sleep(1)
    wb_data = requests.get(url,headers=headers)
    soup = BeautifulSoup(wb_data.text,'lxml')

    prices = soup.select(PriceSel)
    titles = soup.select(TitleSel)
    comments = soup.select(CommentSel)
    urls = soup.select(UrlSel)
    places = soup.select(PlaceSel)

    for price,title,comment,url,place in zip(prices,titles,comments,urls,places):
        data = dict(price=int(price.get_text()), # store the price as a number so the $gte query below can match
                    title=title.get_text(),
                    place=place.get_text().split('\n')[-1].strip(),
                    comment=comment.get_text().split('\n')[1].strip(),
                    url=url.get('detailurl'))
        sheet_out.insert_one(data)

def main():
    for URL in url:
        GetInfo(URL)
    for items in sheet_out.find({'price':{'$gte': 500}}):
        print(items)

if __name__ == '__main__':
    main()

Results

Summary

Storing into the database handles the data much better..

But after learning command-line import and export, it's a bit of a pity that none of it carries over into PyCharm..

Do I still have to call the csv library and do it the way the earlier chapters did? QAQ
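
One way to stay inside Python is to pair pymongo with the standard csv module (a sketch, assuming the ilw.sheet_out data from above is present; it mirrors the mongoexport example rather than being the course's method):

import csv
from pymongo import MongoClient

sheet_out = MongoClient('localhost', 27017)['ilw']['sheet_out']

fields = ['title', 'place', 'price', 'comment']

# Roughly equivalent to:
#   mongoexport -d ilw -c sheet_out --type=csv -f title,place,price,comment -o sheet_out.csv
with open('sheet_out.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fields, extrasaction='ignore')
    writer.writeheader()
    for doc in sheet_out.find({}, {'_id': 0}): # project away _id so rows match the header
        writer.writerow(doc)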

(Solved √) Also, there is no data on the Mac to simulate the Windows import/export commands with… so sometimes the corresponding results can't be reproduced.
