Crawlers for Several Travel Websites

2015-12-25 Golmic

Crawler Python Scrapy

Original article: http://code.lujq.me/2015/12/25/%E5%87%A0%E5%AE%B6%E6%97%85%E6%B8%B8%E7%BD%91%E7%AB%99%E7%9A%84%E7%88%AC%E8%99%AB/


All of the spiders below are built on the Scrapy framework.

Mafengwo

# coding=utf-8
import json
from urlparse import urljoin
import re
import logging
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector

from andaman.utils.html import html2text, parse_time
from andaman.items.qa import QAItem
from andaman.items.jieban import JiebanItem

__author__ = 'zephyre'


class MafengwoQaSpider(scrapy.Spider):
    name = 'mafengwo-qa'

    def parse(self, response):
        html_text = json.loads(response.body)['payload']['list_html']
        for href in Selector(text=html_text).xpath(
                '//li/div[@class="wen"]//div[@class="title"]/a[@href]/@href').extract():
            url = urljoin(response.url, href)
            yield Request(url=url, callback=self.parse_question)

    def start_requests(self):
        for start_idx in xrange(0, 500, 20):
            yield Request(url='http://www.mafengwo.cn/qa/ajax_pager.php?action=question_index&start=%d' % start_idx)

    def parse_question(self, response):
        # Crawl related questions
        for related_href in response.selector.xpath(
                '//div[@class="q-relate"]/ul[@class="bd"]/li/a[@href]/@href').extract():
            url = urljoin(response.url, related_href)
            yield Request(url=url, callback=self.parse_question)

        q_item = self.retrive_question(response)
        yield q_item

        # Crawl the answers
        qid = q_item['qid']
        page = 0
        page_size = 50
        url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
              % (qid, page * page_size)
        yield Request(url=url, callback=self.parse_answer_list, meta={'qid': qid, 'page': page, 'page_size': page_size})

    def retrive_question(self, response):
        """
        Parse the response and extract the question
        """
        tmp = response.selector.xpath('//div[@class="q-detail"]/div[@class="person"]/div[@class="avatar"]/a[@href]')
        try:
            user_href = tmp[0].xpath('./@href').extract()[0]
        except IndexError:
            self.logger.warning('Invalid response: %s' % response.url)
            self.logger.warning(response.body)
            raise
        m = re.search(r'/wenda/u/(\d+)', user_href)
        author_id = int(m.group(1))
        tmp = tmp[0].xpath('./img/@src').extract()[0]
        author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp)
        if author_avatar.endswith('pp48.gif'):
            author_avatar = None
        author_name = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]/a[@class="name"]/text()').extract()[0]

        title = response.selector.xpath('//div[@class="q-content"]/div[@class="q-title"]/h1/text()').extract()[0]

        raw_contents = \
            response.selector.xpath('//div[@class="q-content"]/div[@class="q-info"]/div[@class="q-desc"]').extract()[0]
        contents = html2text(raw_contents)

        tmp = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]//span[@class="visit"]/text()').extract()[0]
        view_cnt = int(re.search(ur'(\d+)\s*浏览', tmp).group(1))

        time_str = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]//span[@class="time"]/text()').extract()[0]
        timestamp = parse_time(time_str)

        tmp = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="user-bar"]/span[@class="fr"]/a[@href]/text()').extract()
        if tmp and tmp[0].strip():
            topic = tmp[0].strip()
        else:
            topic = None

        raw_tags = response.selector.xpath(
            '//div[@class="q-content"]/div[@class="q-info"]/div[@class="q-tags"]/a[@class="a-tag"]/text()').extract()
        tags = [tmp.strip() for tmp in raw_tags if tmp.strip()]

        match = re.search(r'detail-(\d+)\.html', response.url)
        qid = int(match.group(1))

        item = QAItem()
        item['source'] = 'mafengwo'
        item['type'] = 'question'
        item['qid'] = qid
        item['title'] = title
        item['author_nickname'] = author_name
        item['author_id'] = author_id
        if author_avatar:
            item['author_avatar'] = author_avatar
            item['file_urls'] = [author_avatar]
        item['timestamp'] = timestamp
        if topic:
            item['topic'] = topic
        item['contents'] = contents
        item['tags'] = tags
        item['view_cnt'] = view_cnt

        return item

    def parse_answer_list(self, response):
        meta = response.meta
        qid = meta['qid']
        page = meta['page']
        page_size = meta['page_size']

        sel = Selector(text=json.loads(response.body)['payload']['list_html'])
        answer_nodes = sel.xpath('//li[contains(@class, "answer-item")]')
        if not answer_nodes:
            return

        # Look for the next page
        if len(answer_nodes) == page_size:
            next_page = page + 1
            url = 'http://www.mafengwo.cn/qa/ajax_pager.php?qid=%d&action=question_detail&start=%d' \
                  % (qid, next_page * page_size)
            yield Request(url=url, callback=self.parse_answer_list,
                          meta={'qid': qid, 'page': next_page, 'page_size': page_size})

        for answer_node in sel.xpath('//li[contains(@class, "answer-item") and @data-aid]'):
            aid = int(answer_node.xpath('./@data-aid').extract()[0])

            author_node = answer_node.xpath('./div[@class="person"]/div[contains(@class, "avatar") and @data-uid]')[0]
            author_id = int(author_node.xpath('./@data-uid').extract()[0])
            tmp = author_node.xpath('./a/img/@src').extract()[0]
            author_avatar = re.sub(r'\.head\.w\d+\.', '.', tmp)
            if author_avatar.endswith('pp48.gif'):
                author_avatar = None

            content_node = answer_node.xpath('./div[contains(@class,"answer-content")]')[0]

            author_name = content_node.xpath('./div[@class="user-bar"]/a[@class="name"]/text()').extract()[0]

            time_str = content_node.xpath('./div[@class="user-bar"]//span[@class="time"]/text()').extract()[0]
            timestamp = parse_time(time_str)

            accepted = bool(answer_node.xpath('.//div[contains(@class,"answer-best")]'))

            raw_contents = content_node.xpath('.//dl/dd[@class="_j_answer_html"]').extract()[0]
            contents = html2text(raw_contents)

            try:
                vote_cnt = int(answer_node.xpath('.//a[@class="btn-zan"]/span/text()').extract()[0])
            except (IndexError, ValueError):
                self.logger.debug(u'Invalid vote count: %s' % answer_node.extract())
                vote_cnt = 0

            item = QAItem()
            item['type'] = 'answer'
            item['source'] = 'mafengwo'
            item['qid'] = qid
            item['aid'] = aid
            item['author_nickname'] = author_name
            item['author_id'] = author_id
            if author_avatar:
                item['author_avatar'] = author_avatar
                item['file_urls'] = [author_avatar]
            item['timestamp'] = timestamp
            item['contents'] = contents
            item['vote_cnt'] = vote_cnt
            item['accepted'] = accepted

            yield item


class MafengwoSpider(scrapy.Spider):
    name = "mafengwo-jieban"
    allowed_domains = ["mafengwo.cn"]

    def start_requests(self):
        total_page = self.crawler.settings.getint('MAFENGWO_JIEBAN_PAGES', 10)
        session_id = self.crawler.settings.get('MAFENGWO_SESSION_ID')
        cookies = {'PHPSESSID': session_id} if session_id else {}
        for i in range(total_page):
            url = 'http://www.mafengwo.cn/together/ajax.php?act=getTogetherMore&flag=3&offset=%d&mddid=0&timeFlag=1' \
                  '&timestart=' % i
            yield scrapy.Request(url, cookies=cookies)

    def parse(self, response):
        hrefs = scrapy.Selector(text=json.loads(response.body)['data']['html']).xpath('//li/a/@href').extract()
        for href in hrefs:
            url = 'http://www.mafengwo.cn/together/' + href
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # The page embeds its config as JSON in the first <script> block; pull the post id (tid)
        # and the total comment count (the comment AJAX endpoint returns 10 comments per page)
        tid = int(str(response.xpath('//script[1]/text()').re(r'"tid":\d+')[0])[6:])
        url = 'http://www.mafengwo.cn/together/ajax.php?act=moreComment&page=%d&tid=%d' % (0, tid)
        total = int(str(response.xpath('//script[1]/text()').re(r'"total":\d+')[0][8:])) / 10 + 1
        summary = response.xpath('//div[@class="summary"]')
        item = JiebanItem()
        item['source'] = 'mafengwo'
        item['title'] = response.xpath('//title/text()').extract()[0]

        # Each summary <span> starts with a Chinese label; the text is encoded to UTF-8 bytes and
        # the label prefix is sliced off by its byte length (3 bytes per Chinese character)
        item['start_time'] = summary.xpath('//div[@class="summary"]/ul/li[1]/span/text()').extract()[0].encode("UTF-8")[
                             15:]
        item['days'] = summary.xpath('//div[@class="summary"]/ul/li[2]/span/text()').extract()[0].encode("UTF-8")[9:]
        item['destination'] = summary.xpath('//div[@class="summary"]/ul/li[3]/span/text()').extract()[0].encode(
            "UTF-8")[12:].split("/")
        item['departure'] = summary.xpath('//div[@class="summary"]/ul/li[4]/span/text()').extract()[0].encode("UTF-8")[
                            12:]
        item['people'] = summary.xpath('//div[@class="summary"]/ul/li[5]/span/text()').extract()[0].encode("UTF-8")[15:]
        item['description'] = '\n'.join(filter(lambda v: v, [tmp.strip() for tmp in summary.xpath(
            '//div[@class="desc _j_description"]/text()').extract()])).encode("UTF-8")
        item['author_avatar'] = summary.xpath('//div[@class="sponsor clearfix"]/a/img/@src').extract()[0].encode(
            "UTF-8")
        item['comments'] = []
        item['tid'] = tid
        yield scrapy.Request(url,
                             meta={'item': item, 'page': 0, 'total': total, 'tid': tid}, callback=self.parse_comments)

    def parse_comments(self, response):
        item = response.meta['item']
        page = response.meta['page'] + 1
        body = scrapy.Selector(text=json.loads(response.body)['data']['html'])
        if body.extract() != '<html></html>':
            for node in body.xpath('//div[@class="vc_comment"]'):
                try:
                    author_avatar = node.xpath('.//div[@class= "avatar"]/a/img/@src').extract()[0].encode("UTF-8")
                    author = node.xpath('.//a[@class="comm_name"]/text()').extract()[0].encode("UTF-8")
                    cid = int(node.xpath('.//div[@class="comm_reply"]/a/@data-cid').extract()[0].encode("UTF-8"))
                    comment = '\n'.join(
                        filter(lambda v: v, [tmp.strip() for tmp in node.xpath('.//p/text()').extract()])).encode(
                        "UTF-8")
                    comment_item = {'cid': cid, 'author_avatar': author_avatar, 'author': author, 'comment': comment}
                    item['comments'].append(comment_item)
                except IndexError:
                    self.logger.warning('Unable to extract comment from: %s' % (node.extract()))
        if page <= response.meta['total']:
            url = 'http://www.mafengwo.cn/together/ajax.php?act=moreComment&page=%d&tid=%d' % (page, item['tid'])
            yield scrapy.Request(url, meta={'item': item, 'page': page, 'total': response.meta['total']},
                                 callback=self.parse_comments)
        else:
            yield item


pintour


# coding=utf-8

import json
from urlparse import urljoin
import re
import logging
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector

from andaman.utils.html import html2text, parse_time
from andaman.items.jieban import JiebanItem


class PintourSpider(scrapy.Spider):
    name = 'pintour'
    allowed_domains = ['pintour.com']

    def start_requests(self):
        total_page = self.crawler.settings.getint('MAFENGWO_JIEBAN_PAGES', 10)
        for i in range(1, total_page):
            url = 'http://www.pintour.com/list/0-0-0-0-2-1-s-0_%d' % i
            yield scrapy.Request(url)

    def parse(self, response):
        metalist = Selector(text=response.body).xpath('//ul[@class="mateList"]/li/div/h3/a/@href').extract()
        for href in metalist:
            tid = int(href[1:])
            url = 'http://www.pintour.com/%d' % tid
            yield Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        item = JiebanItem()
        item['source'] = 'pintour'
        item['tid'] = int(response.url.split('/')[3])
        item['title'] = response.xpath('//title/text()').extract()[0]
        data = response.xpath('//div[@class="colBox clearfix"]')[0]
        item['author'] = data.xpath('//div[@class="colBoxL clearfix"]/dl/dt/a/text()').extract()[0]
        item['author_avatar'] = data.xpath('//div[@class="colBoxL clearfix"]/a/img/@src').extract()[0]
        item['type'] = data.xpath('//div[@class="colBoxR"]/div//a/span/text()').extract()
        time = data.xpath('.//div[@class="timePlace clearfix"]/p/text()').extract()[0]
        item['start_time'] = time
        item['departure'] = data.xpath('.//div[@class="timePlace clearfix"]/p[@class="plrCon"]/a/text()').extract()[0]
        item['destination'] = data.xpath('.//div[@class="timePlace clearfix"]/p[@class="plrCon"]/a/text()').extract()
        del item['destination'][0]
        item['description'] = ' '.join(
            filter(lambda v: v, [tmp.strip() for tmp in data.xpath('//div[@class="colBoxB"]//text()').extract()]))
        item['comments'] = []

        # response.body is UTF-8 encoded, so the trailing "条回应" label is 9 bytes; strip it to get
        # the reply count, then page through 20 replies at a time
        if re.search(r'\d+条回应', response.body):
            reply_num = int(re.search(r'\d+条回应', response.body).group(0)[:-9])
            total = reply_num / 20 + 1
            url = 'http://www.pintour.com/%d_1' % item['tid']
            yield Request(url,
                          meta={'item': item, 'page': 1, 'total': total, 'tid': item['tid']}, callback=self.parse_comments)

    def parse_comments(self, response):
        item = response.meta['item']
        page = response.meta['page'] + 1
        for node in response.xpath('//ul[@class="reply"]/li'):
            author = node.xpath('.//div/input/@value').extract()[0]
            author_avatar = node.xpath('.//a/img/@src').extract()[0]
            comment = node.xpath('.//div/input/@value').extract()[2]
            cid = int(node.xpath('.//div/@class').extract()[0].encode('UTF-8')[10:])
            comment_item = {'cid': cid, 'author_avatar': author_avatar, 'author': author, 'comment': comment}
            item['comments'].append(comment_item)

        if page <= response.meta['total']:
            url = 'http://www.pintour.com/%d_%d' % (item['tid'], page)
            yield Request(url, meta={'item': item, 'page': page, 'total': response.meta['total']},
                          callback=self.parse_comments)
        else:
            yield item

ctrip

# coding=utf-8
import json
from urlparse import urljoin
import re
import logging
import scrapy
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import Selector

from andaman.utils.html import html2text, parse_time
from andaman.items.jieban import JiebanItem


class CtripSpider(scrapy.Spider):
    name = 'ctrip'

    def start_requests(self):
        start_urls = [
            'http://vacations.ctrip.com/tours',
            'http://vacations.ctrip.com/tours/inter'
        ]
        for url in start_urls:
            yield Request(url)

    def parse(self, response):

        # Crawl the list of cities
        for city in response.xpath('//div[@class="sel_list"]/dl/dd/a/@href').extract():
            num = int(re.search(r'\d+', str(city)).group(0))
            url = 'http://you.ctrip.com/DangdiSite/events/%d.html' % num
            yield Request(url, callback=self.parse_city)

    def parse_city(self, response):

        # Crawl the article list on each city's page
        for href in response.xpath('//ul[@class="cf"]/li/a/@href').extract():
            url = urljoin(response.url, href)
            yield Request(url, callback=self.parse_article)

    def parse_article(self, response):
        item = JiebanItem()
        item['title'] = response.xpath('//title/text()').extract()[0]
        item['tid'] = int(response.url.split('/')[5].split('.')[0])
        if response.xpath('//div[@class="gsn-inputbox"]/input[@id="receiver_id"]/../input[@type="text"]/@value').extract():
            item['author'] = response.xpath('//div[@class="gsn-inputbox"]/input[@id="receiver_id"]/../input[@type="text"]/@value').extract()[0]
        else:
            item['author'] = ''
        eventsummaryinfoview = response.xpath('//div[@id="eventsummaryinfoview"]')
        if eventsummaryinfoview.xpath('./p/span[@class="littlepadding"]/text()').extract():
            item['start_time'] = eventsummaryinfoview.xpath('./p/span[@class="littlepadding"]/text()').extract()[0]
        else:
            item['start_time'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_time"]/text()').extract():
            item['days'] = eventsummaryinfoview.xpath('//p[@class="events_time"]/text()').extract()[2]
        else:
            item['days'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract():
            item['departure'] = eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract()[1]
        else:
            item['departure'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract():
            item['destination'] = eventsummaryinfoview.xpath('//p[@class="events_place"]/text()').extract()[2]
        else:
            item['destination'] = ''
        if eventsummaryinfoview.xpath('//p[@class="events_tag"]/a/span/text()').extract():
            item['type'] = eventsummaryinfoview.xpath('//p[@class="events_tag"]/a/span/text()').extract()[0]
        else:
            item['type'] = ''
        if response.xpath('//div[@class="events_infotext"]/p/text()').extract():
            item['description'] = ' '.join(filter(lambda v: v, [tmp.strip() for tmp in response.xpath('//div[@class="events_infotext"]/p/text()').extract()]))
        else:
            item['description'] = ''
        item['comments'] = []
        frmdata = {"page": "1", "eventId": str(item['tid'])}
        url = 'http://you.ctrip.com/CommunitySite/Activity/EventDetail/EventReplyListOrCommentList'
        yield FormRequest(url, formdata=frmdata, method='POST',
                          meta={'item': item, 'page': 0}, callback=self.parse_comments)

    def parse_comments(self, response):
        # Parsing of the reply/comment list is not implemented yet; just log the raw response
        logging.info(response.body)


items


# coding=utf-8
import scrapy


class JiebanItem(scrapy.Item):
    # Data source
    source = scrapy.Field()

    # Title
    title = scrapy.Field()

    # Departure time
    start_time = scrapy.Field()

    # Number of days
    days = scrapy.Field()

    # Destination
    destination = scrapy.Field()

    # Departure place
    departure = scrapy.Field()

    # Group size
    people = scrapy.Field()

    # Post description
    description = scrapy.Field()

    # Author avatar URL
    author_avatar = scrapy.Field()

    # Comments
    comments = scrapy.Field()

    # Post id
    tid = scrapy.Field()

    # Travel style
    type = scrapy.Field()

    # Post author
    author = scrapy.Field()



pipelines

# coding=utf-8
from datetime import datetime

from mongoengine import Document, EmbeddedDocument, EmbeddedDocumentField, StringField, IntField, ListField, connect
import logging

__author__ = 'golmic'


class Comments(EmbeddedDocument):
    # Comment text
    comment = StringField()

    # Comment author
    author = StringField()

    # Author avatar URL
    author_avatar = StringField()

    # Comment id
    cid = IntField()


class JiebanDocument(Document):
    # Data source
    source = StringField()

    # Post title
    title = StringField()

    # Departure time
    startTime = StringField()

    # Estimated number of days
    days = StringField()

    # Destination
    destination = ListField()

    # Departure place
    departure = StringField()

    # Estimated group size
    groupSize = StringField()

    # Post description
    description = StringField()

    # Author avatar URL
    authorAvatar = StringField()

    # Post id
    tid = IntField()

    # Post comments
    comments = ListField(EmbeddedDocumentField(Comments))

    # Author
    author = StringField()

    # Travel style
    type = StringField()


class JiebanPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('PIPELINE_JIEBAN_ENABLED', False):
            from scrapy.exceptions import NotConfigured
            raise NotConfigured
        return cls(crawler.settings)

    def __init__(self, settings):
        self._conn = {}
        self.init_db(settings)

    @staticmethod
    def init_db(settings):
        mongo_uri = settings.get('ANDAMAN_MONGO_URI')
        if mongo_uri:
            return connect(host=mongo_uri)
        else:
            logging.error('Cannot find setting ANDAMAN_MONGO_URI, MongoDB connection is disabled')

    def process_item(self, item, spider):
        source = item['source']
        title = item['title']
        author = item.get('author', '')
        start_time = item['start_time']
        days = item['days']
        destination = item['destination']
        departure = item['departure']
        people = item['people']
        description = item['description']
        author_avatar = item['author_avatar']
        tid = item['tid']
        comments = item['comments']
        ops = {'set__startTime': start_time,
               'set__source': source,
               'set__author': author,
               'set__title': title,
               'set__days': days,
               'set__destination': destination,
               'set__departure': departure,
               'set__groupSize': people,
               'set__description': description,
               'set__comments': comments,
               'set__authorAvatar': author_avatar
            }
        JiebanDocument.objects(tid=tid).update_one(upsert=True, **ops)
        return item


Proxy middleware and the spider code for Ctrip, with support for resuming an interrupted crawl: the xiecheng spider records its progress in per-city start.txt/end.txt marker files (Scrapy's own resume mechanism is sketched after the settings snippet below).

settings.py

DOWNLOADER_MIDDLEWARES = {
  'ctrip.middlewares.ProxyMiddleware': 543,
}
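
Besides the marker-file approach used by the xiecheng spider, Scrapy itself can pause and resume a crawl by persisting its scheduler queue and dupefilter to disk through the JOBDIR setting. A minimal sketch, assuming an arbitrary directory name:

# settings.py (optional): persist crawl state so a stopped run can be resumed
JOBDIR = 'crawls/xiecheng-1'

# or as a one-off on the command line, without touching settings.py:
#   scrapy crawl xiecheng -s JOBDIR=crawls/xiecheng-1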

middlewares.py

import base64


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # Route every request through the proxy (the credentials here are placeholders)
        request.meta["proxy"] = 'http://username:password@proxy.com:9020'
        # Some proxies also expect an explicit Basic auth header
        proxy_user_pass = b"username:password"
        encoded_user_pass = base64.b64encode(proxy_user_pass)
        request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
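
The middleware above hard-codes the proxy endpoint and the Basic auth credentials. A hedged variant that pulls them from settings (HTTP_PROXY_URL and HTTP_PROXY_AUTH are made-up setting names, used only for illustration) would follow the same from_crawler pattern as JiebanPipeline:

import base64


class SettingsProxyMiddleware(object):
    """Like ProxyMiddleware above, but reads the proxy endpoint and credentials from settings."""

    def __init__(self, proxy_url, proxy_auth):
        self.proxy_url = proxy_url
        self.proxy_auth = proxy_auth

    @classmethod
    def from_crawler(cls, crawler):
        # HTTP_PROXY_URL / HTTP_PROXY_AUTH are hypothetical setting names
        return cls(crawler.settings.get('HTTP_PROXY_URL'),
                   crawler.settings.get('HTTP_PROXY_AUTH'))

    def process_request(self, request, spider):
        if not self.proxy_url:
            return
        request.meta['proxy'] = self.proxy_url
        if self.proxy_auth:
            encoded = base64.b64encode(self.proxy_auth.encode('utf-8'))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded

It would be registered in DOWNLOADER_MIDDLEWARES exactly like ProxyMiddleware.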

spiders.xiecheng.py

# -*- coding: utf-8 -*-
import scrapy,os,re
from scrapy.http import Request
from bs4 import BeautifulSoup

class XiechengSpider(scrapy.Spider):
    name = "xiecheng"
    allowed_domains = ["ctrip.com"]
    path = '/Users/lujianqiang/Development/xiecheng/'

    def start_requests(self):
        dir_set = set()
        for line in open('/Users/lujianqiang/Development/ctrip/cx-city.sql','r'):
            if '.html' in line:
                dir_set.add(line.split(',')[3].split('/')[2].split('.')[0])
        for dir_name in dir_set:
            try:
                os.mkdir(self.path+dir_name)
            except FileExistsError:
                pass
            url = 'http://you.ctrip.com/travels/'+dir_name+'.html'
            if not os.path.exists(self.path+dir_name+'/end.txt'):
                if os.path.exists(self.path+dir_name+'/start.txt'):
                    for start_url in open(self.path+dir_name+'/start.txt','r'):
                        if not re.match('http://seccenter.ctrip.com/seccenter/main.aspx',start_url):
                            yield Request(start_url,callback=self.parse_list,meta={'dir_name':dir_name})
                        else:
                            yield Request(url,callback=self.parse_list,meta={'dir_name':dir_name})
                else:
                    yield Request(url,callback=self.parse_list,meta={'dir_name':dir_name})
            else:
                print('++++++++++++++'+dir_name)

    def parse_list(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        if not soup.find_all("a", class_='nextpage disabled'):
            with open(self.path+response.meta['dir_name']+'/start.txt','w') as f:
                f.write(response.url)
                f.close()
            nextpage = soup.find_all("a", class_='nextpage')
            if nextpage:
                yield Request('http://you.ctrip.com/'+nextpage[0]['href'],callback=self.parse_list,meta={'dir_name':response.meta['dir_name']})
        else:
            print('=============='+response.meta['dir_name'])
            with open(self.path+response.meta['dir_name']+'/end.txt','wb') as f:
                f.write(bytes('end',encoding = "utf8"))
                f.close()
        items = soup.find_all("a", class_='journal-item cf')
        for item in items:
            post = {}
            post['numview'] = item.ul.find_all('i',class_='numview')[0].get_text()
            post['want'] = item.ul.find_all('i',class_='want')[0].get_text()
            post['numreply'] = item.ul.find_all('i',class_='numreply')[0].get_text()
            filename = item['href'].split('/')[3]
            if not os.path.exists(self.path+response.meta['dir_name']+'/'+filename):
                yield Request('http://you.ctrip.com/'+item['href'],callback=self.parse_article,meta={'dir_name':response.meta['dir_name'],'post':post})
            else:
                print(filename+'exist')

    def parse_article(self, response):
        #http://you.ctrip.com/travels/hangzhou14/2869877.html
        filename = self.path+response.meta['dir_name']+'/'+response.url.split('/')[6]
        numview = response.meta['post']['numview']
        want = response.meta['post']['want']
        numreply = response.meta['post']['numreply']
        string = '<numview>{}</numview><want>{}</want><numreply>{}</numreply><url>{}</url>'.format(numview,want,numreply,response.url)
        with open(filename,'wb') as f:
            f.write(bytes(string,encoding = "utf8")+response.body)

parse_html

# -*- coding: utf-8 -*-
import os,re
from bs4 import BeautifulSoup
import pymongo

localdb = pymongo.MongoClient('mongodb://188.166.210.151',27017)['ctrip']
article_list = os.listdir()
for article_name in article_list:
  if ".html" not in article_name:
    continue
  file = open(article_name,"r")
  html = file.read()
  if not html:
    continue
  soup = BeautifulSoup(html, 'html.parser')
  post = {}
  print(article_name)
  post['title'] = soup.h2.get_text().strip()

  if re.findall('发表于 (\d\d\d\d-\d\d-\d\d)',html):
    post['date'] = re.findall('发表于 (\d\d\d\d-\d\d-\d\d)',html)[0]
  else:
    post['date'] = ''

  post['author_name'] = soup.find_all('a',id='authorDisplayName')[0].get_text().strip()

  post['author_url'] = 'http://you.ctrip.com' + soup.find_all('a',id='authorDisplayName')[0]['href']

  days = soup.find_all('i',class_='days')
  if days:
    post['days'] = days[0].parent.get_text().split(':')[1].strip()
  times = soup.find_all('i',class_='times')
  if times:
    post['times'] = times[0].parent.get_text().split(':')[1].strip()
  costs = soup.find_all('i',class_='costs')
  if costs:
    post['costs'] = costs[0].parent.get_text().split(':')[1].strip()
  whos = soup.find_all('i',class_='whos')
  if whos:
    post['whos'] = whos[0].parent.get_text().split(':')[1].strip()

  gs_a_pois = soup.find_all('a',class_='gs_a_poi')
  gs_a_poi_set = set()
  for gs_a_poi in gs_a_pois:
    gs_a_poi_set.add(gs_a_poi.get_text().strip('\n'))
  print(gs_a_poi_set)
  post['gs_a_poi'] = list(gs_a_poi_set)
  # these custom tags were prepended to each saved file by XiechengSpider.parse_article
  post['url'] = soup.url.get_text()
  post['numview'] = soup.numview.get_text()
  post['want'] = soup.want.get_text()
  post['numreply'] = soup.numreply.get_text()
  print(post)
  content = str(soup.find_all('div',class_='ctd_content')[0])
  post['content'] = content
  localdb.xiecheng.insert_one(post)