Crawlers for Several Travel Websites
Original link: http://code.lujq.me/2015/12/25/%E5%87%A0%E5%AE%B6%E6%97%85%E6%B8%B8%E7%BD%91%E7%AB%99%E7%9A%84%E7%88%AC%E8%99%AB/
Note: the following is a cached copy of the original page, kept to speed up access. It has been reformatted, so formatting problems or occasional omissions are possible; please refer to the original.
All of the spiders below are built on the Scrapy framework.
蚂蜂窝 (Mafengwo)
# coding=utf-8
import json
from urlparse import urljoin
import re
import logging
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from andaman.utils.html import html2text, parse_time
from andaman.items.qa import QAItem
from andaman.items.jieban import JiebanItem
__author__ = 'zephyre'
class MafengwoQaSpider(scrapy.Spider):
    name = 'mafengwo-qa'

    def parse(self, response):
        html_text = json.loads(response.body)['payload']['list_html']
        for href in Selector(text=html_text).xpath(
                '//li/div[@class="wen"]//div[@class="title"]/a[@href]/@href').extract():
            url = urljoin(response.url, href)
            yield Request(url=url, callback=self.parse_question)

    def start_requests(self):
        for start_idx in xrange(0, 500, 20):
            yield Request(url='http://www.mafengwo.cn/qa/ajax_pager.php?action=question_index&start=%d' % start_idx)

    def parse_question(self, response):
        # Crawl related questions
        for related_href in response.selector.xpath(
                '//div[@class="q-relate"]/ul[@class="bd"]/li/a[@href]/@href').extract():
            url = urljoin(response.url, related_href)
            yield Request(url=url, callback=self.parse_question)

        q_item = self.retrive_question(response)
        yield q_item

        # Crawl the answers
        qid = q_item['qid']
        page = 0
        page_size = 50
        url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
              % (qid, page * page_size)
        yield Request(url=url, callback=self.parse_answer_list,
                      meta={'qid': qid, 'page': page, 'page_size': page_size})

    def retrive_question(self, response):
        """
        Parse the response and build the question item
        """
        tmp = response.selector.xpath('//div[@class="q-detail"]/div[@class="person"]/div[@class="avatar"]/a[@href]')
        try:
            user_href = tmp[0].xpath('./@href').extract()[0]
        except IndexError:
            self.logger.warning('Invalid response: %s' % response.url)
            self.logger.warning(response.body)
            raise
        m = re.search(r'/wenda/u/(\d+)', user_href)
        author_id = int(m.group(1))

        # Strip the thumbnail size suffix from the avatar URL; pp48.gif is the default placeholder
        tmp = tmp[0].xpath('./img/@src').extract()[0]
        author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp)
        if author_avatar.endswith('pp48.gif'):
            author_avatar = None

        author_name = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]/a[@class="name"]/text()').extract()[0]
        title = response.selector.xpath('//div[@class="q-content"]/div[@class="q-title"]/h1/text()').extract()[0]
        raw_contents = \
            response.selector.xpath('//div[@class="q-content"]/div[@class="q-info"]/div[@class="q-desc"]').extract()[0]
        contents = html2text(raw_contents)

        tmp = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]//span[@class="visit"]/text()').extract()[0]
        view_cnt = int(re.search(ur'(\d+)\s*浏览', tmp).group(1))

        time_str = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]//span[@class="time"]/text()').extract()[0]
        timestamp = parse_time(time_str)

        tmp = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]/span[@class="fr"]/a[@href]/text()').extract()
        if tmp and tmp[0].strip():
            topic = tmp[0].strip()
        else:
            topic = None

        raw_tags = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="q-info"]/div[@class="q-tags"]/a[@class="a-tag"]/text()').extract()
        tags = [tmp.strip() for tmp in raw_tags if tmp.strip()]

        match = re.search(r'detail-(\d+)\.html', response.url)
        qid = int(match.group(1))

        item = QAItem()
        item['source'] = 'mafengwo'
        item['type'] = 'question'
        item['qid'] = qid
        item['title'] = title
        item['author_nickname'] = author_name
        item['author_id'] = author_id
        if author_avatar:
            item['author_avatar'] = author_avatar
            item['file_urls'] = [author_avatar]
        item['timestamp'] = timestamp
        if topic:
            item['topic'] = topic
        item['contents'] = contents
        item['tags'] = tags
        item['view_cnt'] = view_cnt
        return item

    def parse_answer_list(self, response):
        meta = response.meta
        qid = meta['qid']
        page = meta['page']
        page_size = meta['page_size']
        sel = Selector(text=json.loads(response.body)['payload']['list_html'])
        answer_nodes = sel.xpath('//li[contains(@class, "answer-item")]')
        if not answer_nodes:
            return

        # Look for the next page
        if len(answer_nodes) == page_size:
            next_page = page + 1
            url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
                  % (qid, next_page * page_size)
            yield Request(url=url, callback=self.parse_answer_list,
                          meta={'qid': qid, 'page': next_page, 'page_size': page_size})

        for answer_node in sel.xpath('//li[contains(@class, "answer-item") and @data-aid]'):
            aid = int(answer_node.xpath('./@data-aid').extract()[0])
            author_node = answer_node.xpath('./div[@class="person"]/div[contains(@class, "avatar") and @data-uid]')[0]
            author_id = int(author_node.xpath('./@data-uid').extract()[0])
            tmp = author_node.xpath('./a/img/@src').extract()[0]
            author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp)
            if author_avatar.endswith('pp48.gif'):
                author_avatar = None
            content_node = answer_node.xpath('./div[contains(@class,"answer-content")]')[0]
            author_name = content_node.xpath('./div[@class="user-bar"]/a[@class="name"]/text()').extract()[0]
            time_str = content_node.xpath('./div[@class="user-bar"]//span[@class="time"]/text()').extract()[0]
            timestamp = parse_time(time_str)
            accepted = bool(answer_node.xpath('.//div[contains(@class,"answer-best")]'))
            raw_contents = content_node.xpath('.//dl/dd[@class="_j_answer_html"]').extract()[0]
            contents = html2text(raw_contents)
            try:
                vote_cnt = int(answer_node.xpath('.//a[@class="btn-zan"]/span/text()').extract()[0])
            except (IndexError, ValueError):
                self.logger.debug(u'Invalid vote count: %s' % answer_node.extract())
                vote_cnt = 0

            item = QAItem()
            item['type'] = 'answer'
            item['source'] = 'mafengwo'
            item['qid'] = qid
            item['aid'] = aid
            item['author_nickname'] = author_name
            item['author_id'] = author_id
            if author_avatar:
                item['author_avatar'] = author_avatar
                item['file_urls'] = [author_avatar]
            item['timestamp'] = timestamp
            item['contents'] = contents
            item['vote_cnt'] = vote_cnt
            item['accepted'] = accepted
            yield item


class MafengwoSpider(scrapy.Spider):
    name = "mafengwo-jieban"
    allowed_domains = ["mafengwo.cn"]

    def start_requests(self):
        total_page = self.crawler.settings.getint('MAFENGWO_JIEBAN_PAGES', 10)
        session_id = self.crawler.settings.get('MAFENGWO_SESSION_ID')
        cookies = {'PHPSESSID': session_id} if session_id else {}
        for i in range(total_page):
            url = 'http://www.mafengwo.cn/together/ajax.php?act=getTogetherMore&flag=3&offset=%d&mddid=0&timeFlag=1' \
                  '&timestart=' % i
            yield scrapy.Request(url, cookies=cookies)

    def parse(self, response):
        hrefs = scrapy.Selector(text=json.loads(response.body)['data']['html']).xpath('//li/a/@href').extract()
        for href in hrefs:
            url = 'http://www.mafengwo.cn/together/' + href
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # tid and the total comment count are embedded in the page's inline JavaScript
        tid = int(str(response.xpath('//script[1]/text()').re(r'"tid":\d+')[0])[6:])
        url = 'http://www.mafengwo.cn/together/ajax.php?act=moreComment&page=%d&tid=%d' % (0, tid)
        total = int(str(response.xpath('//script[1]/text()').re(r'"total":\d+')[0][8:])) / 10 + 1
        summary = response.xpath('//div[@class="summary"]')

        item = JiebanItem()
        item['source'] = 'mafengwo'
        item['title'] = response.xpath('//title/text()').extract()[0]
        # The [15:]/[9:]/[12:] slices strip the fixed Chinese label prefix from the UTF-8 encoded text
        item['start_time'] = summary.xpath('//div[@class="summary"]/ul/li[1]/span/text()').extract()[0].encode("UTF-8")[15:]
        item['days'] = summary.xpath('//div[@class="summary"]/ul/li[2]/span/text()').extract()[0].encode("UTF-8")[9:]
        item['destination'] = summary.xpath('//div[@class="summary"]/ul/li[3]/span/text()').extract()[0].encode("UTF-8")[12:].split("/")
        item['departure'] = summary.xpath('//div[@class="summary"]/ul/li[4]/span/text()').extract()[0].encode("UTF-8")[12:]
        item['people'] = summary.xpath('//div[@class="summary"]/ul/li[5]/span/text()').extract()[0].encode("UTF-8")[15:]
        item['description'] = '\n'.join(filter(lambda v: v, [
            tmp.strip() for tmp in summary.xpath('//div[@class="desc _j_description"]/text()').extract()])).encode("UTF-8")
        item['author_avatar'] = summary.xpath('//div[@class="sponsor clearfix"]/a/img/@src').extract()[0].encode("UTF-8")
        item['comments'] = []
        item['tid'] = tid
        yield scrapy.Request(url,
                             meta={'item': item, 'page': 0, 'total': total, 'tid': tid},
                             callback=self.parse_comments)

    def parse_comments(self, response):
        item = response.meta['item']
        page = response.meta['page'] + 1
        body = scrapy.Selector(text=json.loads(response.body)['data']['html'])
        if body.extract() != '<html></html>':
            for node in body.xpath('//div[@class="vc_comment"]'):
                try:
                    author_avatar = node.xpath('.//div[@class= "avatar"]/a/img/@src').extract()[0].encode("UTF-8")
                    author = node.xpath('.//a[@class="comm_name"]/text()').extract()[0].encode("UTF-8")
                    cid = int(node.xpath('.//div[@class="comm_reply"]/a/@data-cid').extract()[0].encode("UTF-8"))
                    comment = '\n'.join(filter(lambda v: v, [
                        tmp.strip() for tmp in node.xpath('.//p/text()').extract()])).encode("UTF-8")
                    comment_item = {'cid': cid, 'author_avatar': author_avatar, 'author': author, 'comment': comment}
                    item['comments'].append(comment_item)
                except IndexError:
                    self.logger.warning('Unable to extract comment from: %s' % (node.extract()))
        if page <= response.meta['total']:
            url = 'http://www.mafengwo.cn/together/ajax.php?act=moreComment&page=%d&tid=%d' % (page, item['tid'])
            yield scrapy.Request(url, meta={'item': item, 'page': page, 'total': response.meta['total']},
                                 callback=self.parse_comments)
        else:
            yield item
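Both Mafengwo spiders pull their configuration from custom Scrapy settings. A minimal sketch of the relevant part of settings.py and of how the spiders would be launched; the values below are placeholders, not taken from the original project:

# settings.py (sketch; the values are placeholders)
MAFENGWO_JIEBAN_PAGES = 10                # how many listing pages mafengwo-jieban fetches
MAFENGWO_SESSION_ID = '<your PHPSESSID>'  # optional; sent as the PHPSESSID cookie when set

# Run the spiders with:
#   scrapy crawl mafengwo-qa
#   scrapy crawl mafengwo-jieban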
pintour
# coding=utf-8
import json
from urlparse import urljoin
import re
import logging
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from andaman.utils.html import html2text, parse_time
from andaman.items.jieban import JiebanItem
class PintourSpider(scrapy.Spider):
    name = 'pintour'
    allowed_domains = ['pintour.com']

    def start_requests(self):
        # Reuses the Mafengwo page-count setting for the number of listing pages
        total_page = self.crawler.settings.getint('MAFENGWO_JIEBAN_PAGES', 10)
        for i in range(1, total_page):
            url = 'http://www.pintour.com/list/0-0-0-0-2-1-s-0_%d' % i
            yield scrapy.Request(url)

    def parse(self, response):
        metalist = Selector(text=response.body).xpath('//ul[@class="mateList"]/li/div/h3/a/@href').extract()
        for href in metalist:
            tid = int(href[1:])
            url = 'http://www.pintour.com/%d' % tid
            yield Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        item = JiebanItem()
        item['source'] = 'pintour'
        item['tid'] = int(response.url.split('/')[3])
        item['title'] = response.xpath('//title/text()').extract()[0]
        data = response.xpath('//div[@class="colBox clearfix"]')[0]
        item['author'] = data.xpath('//div[@class="colBoxL clearfix"]/dl/dt/a/text()').extract()[0]
        item['author_avatar'] = data.xpath('//div[@class="colBoxL clearfix"]/a/img/@src').extract()[0]
        item['type'] = data.xpath('//div[@class="colBoxR"]/div//a/span/text()').extract()
        time = data.xpath('.//div[@class="timePlace clearfix"]/p/text()').extract()[0]
        item['start_time'] = time
        # The first link in the time/place block is the departure city; the rest are destinations
        item['departure'] = data.xpath('.//div[@class="timePlace clearfix"]/p[@class="plrCon"]/a/text()').extract()[0]
        item['destination'] = data.xpath('.//div[@class="timePlace clearfix"]/p[@class="plrCon"]/a/text()').extract()
        del item['destination'][0]
        item['description'] = ' '.join(
            filter(lambda v: v, [tmp.strip() for tmp in data.xpath('//div[@class="colBoxB"]//text()').extract()]))
        item['comments'] = []
        if re.search(r'\d+条回应', response.body):
            # "NN条回应" gives the reply count; the listing shows 20 replies per page
            reply_num = int(re.search(r'\d+条回应', response.body).group(0)[:-9])
            total = reply_num / 20 + 1
            url = 'http://www.pintour.com/%d_1' % item['tid']
            yield Request(url,
                          meta={'item': item, 'page': 1, 'total': total, 'tid': item['tid']},
                          callback=self.parse_comments)

    def parse_comments(self, response):
        item = response.meta['item']
        page = response.meta['page'] + 1
        for node in response.xpath('//ul[@class="reply"]/li'):
            author = node.xpath('.//div/input/@value').extract()[0]
            author_avatar = node.xpath('.//a/img/@src').extract()[0]
            comment = node.xpath('.//div/input/@value').extract()[2]
            cid = int(node.xpath('.//div/@class').extract()[0].encode('UTF-8')[10:])
            comment_item = {'cid': cid, 'author_avatar': author_avatar, 'author': author, 'comment': comment}
            item['comments'].append(comment_item)
        if page <= response.meta['total']:
            url = 'http://www.pintour.com/%d_%d' % (item['tid'], page)
            yield Request(url, meta={'item': item, 'page': page, 'total': response.meta['total']},
                          callback=self.parse_comments)
        else:
            yield item
ctrip
# coding=utf-8
import json
from urlparse import urljoin
import re
import logging
import scrapy
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import Selector
from andaman.utils.html import html2text, parse_time
from andaman.items.jieban import JiebanItem
class CtripSpider(scrapy.Spider):
    name = 'ctrip'

    def start_requests(self):
        start_urls = [
            'http://vacations.ctrip.com/tours',
            'http://vacations.ctrip.com/tours/inter'
        ]
        for url in start_urls:
            yield Request(url)

    def parse(self, response):
        # Crawl the city list
        for city in response.xpath('//div[@class="sel_list"]/dl/dd/a/@href').extract():
            num = int(re.search(r'\d+', str(city)).group(0))
            url = 'http://you.ctrip.com/DangdiSite/events/%d.html' % num
            yield Request(url, callback=self.parse_city)

    def parse_city(self, response):
        # Crawl the article list on each city's page
        for href in response.xpath('//ul[@class="cf"]/li/a/@href').extract():
            url = urljoin(response.url, href)
            yield Request(url, callback=self.parse_article)

    def parse_article(self, response):
        item = JiebanItem()
        item['title'] = response.xpath('//title/text()').extract()[0]
        item['tid'] = int(response.url.split('/')[5].split('.')[0])

        author_sel = response.xpath(
            '//div[@class="gsn-inputbox"]/input[@id="receiver_id"]/../input[@type="text"]/@value')
        if author_sel.extract():
            item['author'] = author_sel.extract()[0]
        else:
            item['author'] = ''

        eventsummaryinfoview = response.xpath('//div[@id="eventsummaryinfoview"]')
        if eventsummaryinfoview.xpath('./p/span[@class="littlepadding"]/text()').extract():
            item['start_time'] = eventsummaryinfoview.xpath('./p/span[@class="littlepadding"]/text()').extract()[0]
        else:
            item['start_time'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_time"]/text()').extract():
            item['days'] = eventsummaryinfoview.xpath('//p[@class="events_time"]/text()').extract()[2]
        else:
            item['days'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract():
            item['departure'] = eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract()[1]
        else:
            item['departure'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract():
            item['destination'] = eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract()[2]
        else:
            item['destination'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_tag"]/a/span/text()').extract():
            item['type'] = eventsummaryinfoview.xpath('//p[@class="events_tag"]/a/span/text()').extract()[0]
        else:
            item['type'] = ''
        if response.xpath('//div[@class="events_infotext"]/p/text()').extract():
            item['description'] = ' '.join(filter(lambda v: v, [
                tmp.strip() for tmp in response.xpath('//div[@class="events_infotext"]/p/text()').extract()]))
        else:
            item['description'] = ''
        item['comments'] = []

        frmdata = {"page": "1", "eventId": str(item['tid'])}
        url = 'http://you.ctrip.com/CommunitySite/Activity/EventDetail/EventReplyListOrCommentList'
        yield FormRequest(url, formdata=frmdata, method='POST',
                          meta={'item': item, 'page': 0}, callback=self.parse_comments)

    def parse_comments(self, response):
        # Only logs the raw reply/comment response; parsing of the comments is not implemented
        logging.info(response.body)
items
# coding=utf-8
import scrapy
class JiebanItem(scrapy.Item):
    # Data source (which site the item came from)
    source = scrapy.Field()
    # Title
    title = scrapy.Field()
    # Departure time
    start_time = scrapy.Field()
    # Number of days
    days = scrapy.Field()
    # Destination
    destination = scrapy.Field()
    # Departure place
    departure = scrapy.Field()
    # Expected group size
    people = scrapy.Field()
    # Post description
    description = scrapy.Field()
    # Author avatar URL
    author_avatar = scrapy.Field()
    # Comments
    comments = scrapy.Field()
    # Post id
    tid = scrapy.Field()
    # Travel style
    type = scrapy.Field()
    # Post author
    author = scrapy.Field()
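The QA spider above imports QAItem from andaman.items.qa, which the post does not show. A minimal sketch reconstructed from the fields the spider actually fills in (the file_urls field suggests the project also runs Scrapy's FilesPipeline):

# andaman/items/qa.py (sketch, reconstructed from the fields used by MafengwoQaSpider)
import scrapy


class QAItem(scrapy.Item):
    source = scrapy.Field()           # data source, e.g. 'mafengwo'
    type = scrapy.Field()             # 'question' or 'answer'
    qid = scrapy.Field()              # question id
    aid = scrapy.Field()              # answer id (answers only)
    title = scrapy.Field()            # question title (questions only)
    author_nickname = scrapy.Field()
    author_id = scrapy.Field()
    author_avatar = scrapy.Field()
    file_urls = scrapy.Field()        # consumed by the files pipeline
    timestamp = scrapy.Field()
    topic = scrapy.Field()
    contents = scrapy.Field()
    tags = scrapy.Field()
    view_cnt = scrapy.Field()
    vote_cnt = scrapy.Field()
    accepted = scrapy.Field()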
pipelines
# coding=utf-8
from datetime import datetime
from mongoengine import Document, EmbeddedDocument, EmbeddedDocumentField, StringField, IntField, ListField, connect
import logging
__author__ = 'golmic'
class Comments(EmbeddedDocument):
    # Comment text
    comment = StringField()
    # Comment author
    author = StringField()
    # Author avatar URL
    author_avatar = StringField()
    # Comment id
    cid = IntField()


class JiebanDocument(Document):
    # Data source
    source = StringField()
    # Post title
    title = StringField()
    # Departure time
    startTime = StringField()
    # Expected number of days
    days = StringField()
    # Destination
    destination = ListField()
    # Departure place
    departure = StringField()
    # Expected group size
    groupSize = StringField()
    # Post description
    description = StringField()
    # Author avatar URL
    authorAvatar = StringField()
    # Post id
    tid = IntField()
    # Post comments
    comments = ListField(EmbeddedDocumentField(Comments))
    # Author
    author = StringField()
    # Travel style
    type = StringField()


class JiebanPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('PIPELINE_JIEBAN_ENABLED', False):
            from scrapy.exceptions import NotConfigured
            raise NotConfigured
        return cls(crawler.settings)

    def __init__(self, settings):
        self._conn = {}
        self.init_db(settings)

    @staticmethod
    def init_db(settings):
        mongo_uri = settings.get('ANDAMAN_MONGO_URI')
        if mongo_uri:
            return connect(host=mongo_uri)
        else:
            logging.error('Cannot find setting ANDAMAN_MONGO_URI, MongoDB connection is disabled')

    def process_item(self, item, spider):
        source = item['source']
        title = item['title']
        author = item.get('author', '')
        start_time = item['start_time']
        days = item['days']
        destination = item['destination']
        departure = item['departure']
        people = item['people']
        description = item['description']
        author_avatar = item['author_avatar']
        tid = item['tid']
        comments = item['comments']

        ops = {'set__startTime': start_time,
               'set__source': source,
               'set__author': author,
               'set__title': title,
               'set__days': days,
               'set__destination': destination,
               'set__departure': departure,
               'set__groupSize': people,
               'set__description': description,
               'set__comments': comments,
               'set__authorAvatar': author_avatar
               }
        # Upsert keyed on tid, so re-crawled posts update the existing document
        JiebanDocument.objects(tid=tid).update_one(upsert=True, **ops)
        return item
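The pipeline is gated behind a couple of settings. A minimal sketch of how it might be wired up in settings.py; the module path andaman.pipelines.JiebanPipeline and the MongoDB URI are assumptions, not taken from the original post:

# settings.py (sketch; pipeline path and URI are assumptions)
ITEM_PIPELINES = {
    'andaman.pipelines.JiebanPipeline': 300,
}
PIPELINE_JIEBAN_ENABLED = True
ANDAMAN_MONGO_URI = 'mongodb://localhost:27017/andaman'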
Below are the proxy middleware and the spider code; the spider supports resuming an interrupted crawl.
settings.py
DOWNLOADER_MIDDLEWARES = {
    'ctrip.middlewares.ProxyMiddleware': 543,
}
middlewares.py
import base64


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        request.meta["proxy"] = 'http://username:password@proxy.com:9020'
        # Attach HTTP Basic credentials for the proxy
        proxy_user_pass = b"username:password"
        encoded_user_pass = base64.b64encode(proxy_user_pass)
        request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
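Hard-coding the proxy URL and credentials works, but one way to keep them out of the source is to read them from Scrapy settings instead. A sketch of that variant, assuming hypothetical setting names PROXY_URL, PROXY_USER and PROXY_PASS:

import base64


class SettingsProxyMiddleware(object):
    """Variant of the middleware above that reads the proxy from settings (sketch)."""

    def process_request(self, request, spider):
        settings = spider.crawler.settings
        proxy_url = settings.get('PROXY_URL')   # e.g. 'http://proxy.com:9020' (hypothetical setting)
        if not proxy_url:
            return
        request.meta['proxy'] = proxy_url
        user = settings.get('PROXY_USER')
        password = settings.get('PROXY_PASS')
        if user and password:
            creds = base64.b64encode(('%s:%s' % (user, password)).encode('utf-8'))
            request.headers['Proxy-Authorization'] = b'Basic ' + creds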
spiders.xiecheng.py
# -*- coding: utf-8 -*-
import scrapy,os,re
from scrapy.http import Request
from bs4 import BeautifulSoup
class XiechengSpider(scrapy.Spider):
    name = "xiecheng"
    allowed_domains = ["ctrip.com"]
    path = '/Users/lujianqiang/Development/xiecheng/'

    def start_requests(self):
        dir_set = set()
        for line in open('/Users/lujianqiang/Development/ctrip/cx-city.sql', 'r'):
            if '.html' in line:
                dir_set.add(line.split(',')[3].split('/')[2].split('.')[0])
        for dir_name in dir_set:
            try:
                os.mkdir(self.path + dir_name)
            except FileExistsError:
                pass
            url = 'http://you.ctrip.com/travels/' + dir_name + '.html'
            # end.txt marks a destination that has been fully crawled; start.txt stores the last
            # list page reached, so an interrupted crawl can resume from there
            if not os.path.exists(self.path + dir_name + '/end.txt'):
                if os.path.exists(self.path + dir_name + '/start.txt'):
                    for start_url in open(self.path + dir_name + '/start.txt', 'r'):
                        if not re.match('http://seccenter.ctrip.com/seccenter/main.aspx', start_url):
                            yield Request(start_url, callback=self.parse_list, meta={'dir_name': dir_name})
                        else:
                            yield Request(url, callback=self.parse_list, meta={'dir_name': dir_name})
                else:
                    yield Request(url, callback=self.parse_list, meta={'dir_name': dir_name})
            else:
                print('++++++++++++++' + dir_name)

    def parse_list(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        if not soup.find_all("a", class_='nextpage disabled'):
            # Record the current list page so the crawl can resume here later
            with open(self.path + response.meta['dir_name'] + '/start.txt', 'w') as f:
                f.write(response.url)
            nextpage = soup.find_all("a", class_='nextpage')
            if nextpage:
                yield Request('http://you.ctrip.com/' + nextpage[0]['href'], callback=self.parse_list,
                              meta={'dir_name': response.meta['dir_name']})
        else:
            print('==============' + response.meta['dir_name'])
            # Last list page reached: drop an end.txt marker for this destination
            with open(self.path + response.meta['dir_name'] + '/end.txt', 'wb') as f:
                f.write(bytes('end', encoding="utf8"))
        items = soup.find_all("a", class_='journal-item cf')
        for item in items:
            post = {}
            post['numview'] = item.ul.find_all('i', class_='numview')[0].get_text()
            post['want'] = item.ul.find_all('i', class_='want')[0].get_text()
            post['numreply'] = item.ul.find_all('i', class_='numreply')[0].get_text()
            filename = item['href'].split('/')[3]
            if not os.path.exists(self.path + response.meta['dir_name'] + '/' + filename):
                yield Request('http://you.ctrip.com/' + item['href'], callback=self.parse_article,
                              meta={'dir_name': response.meta['dir_name'], 'post': post})
            else:
                print(filename + 'exist')

    def parse_article(self, response):
        # e.g. http://you.ctrip.com/travels/hangzhou14/2869877.html
        filename = self.path + response.meta['dir_name'] + '/' + response.url.split('/')[6]
        numview = response.meta['post']['numview']
        want = response.meta['post']['want']
        numreply = response.meta['post']['numreply']
        # Prepend the list-page statistics to the saved HTML so parse_html can pick them up later
        string = '<numview>{}</numview><want>{}</want><numreply>{}</numreply><url>{}</url>'.format(
            numview, want, numreply, response.url)
        with open(filename, 'wb') as f:
            f.write(bytes(string, encoding="utf8") + response.body)
parse_html
# -*- coding: utf-8 -*-
import os,re
from bs4 import BeautifulSoup
import pymongo
localdb = pymongo.MongoClient('mongodb://188.166.210.151',27017)['ctrip']
article_list = os.listdir()
for article_name in article_list:
    if ".html" not in article_name:
        continue
    file = open(article_name, "r")
    html = file.read()
    if not html:
        continue
    soup = BeautifulSoup(html, 'html.parser')
    post = {}
    print(article_name)
    post['title'] = soup.h2.get_text().strip()
    if re.findall('发表于 (\d\d\d\d-\d\d-\d\d)', html):
        post['date'] = re.findall('发表于 (\d\d\d\d-\d\d-\d\d)', html)[0]
    else:
        post['date'] = ''
    post['author_name'] = soup.find_all('a', id='authorDisplayName')[0].get_text().strip()
    post['author_url'] = 'http://you.ctrip.com' + soup.find_all('a', id='authorDisplayName')[0]['href']
    days = soup.find_all('i', class_='days')
    if days:
        post['days'] = days[0].parent.get_text().split(':')[1].strip()
    times = soup.find_all('i', class_='times')
    if times:
        post['times'] = times[0].parent.get_text().split(':')[1].strip()
    costs = soup.find_all('i', class_='costs')
    if costs:
        post['costs'] = costs[0].parent.get_text().split(':')[1].strip()
    whos = soup.find_all('i', class_='whos')
    if whos:
        post['whos'] = whos[0].parent.get_text().split(':')[1].strip()
    gs_a_pois = soup.find_all('a', class_='gs_a_poi')
    gs_a_poi_set = set()
    for gs_a_poi in gs_a_pois:
        gs_a_poi_set.add(gs_a_poi.get_text().strip('\n'))
    print(gs_a_poi_set)
    post['gs_a_poi'] = list(gs_a_poi_set)
    # These tags were prepended to the saved file by the spider's parse_article
    post['url'] = soup.url.get_text()
    post['numview'] = soup.numview.get_text()
    post['want'] = soup.want.get_text()
    post['numreply'] = soup.numreply.get_text()
    print(post)
    content = str(soup.find_all('div', class_='ctd_content')[0])
    post['content'] = content
    localdb.xiecheng.insert_one(post)
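The script walks the HTML files the spider saved into the current working directory, so it is meant to be run from inside one of the destination folders. Since re-running it would insert duplicate documents, one option (an addition of mine, not part of the original script) is to key the collection on the article URL:

# Sketch: make re-runs idempotent by keying on the article URL
localdb.xiecheng.create_index('url', unique=True)
# ...and inside the loop, replace insert_one(post) with:
# localdb.xiecheng.replace_one({'url': post['url']}, post, upsert=True)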