from scrapy.exceptions import DropItem
import pymongo


class MongoDBPipeline(object):
    """Item pipeline that stores scraped items in a MongoDB collection."""

    def __init__(self, mongo_server, mongo_port, mongo_db, mongo_collection):
        self.mongo_server = mongo_server
        self.mongo_port = mongo_port
        self.mongo_db = mongo_db
        self.mongo_collection = mongo_collection

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the MongoDB connection parameters from the project settings.
        return cls(
            mongo_server=crawler.settings.get('MONGO_SERVER'),
            mongo_port=crawler.settings.get('MONGO_PORT'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            mongo_collection=crawler.settings.get('MONGO_COLLECTION'),
        )

    def open_spider(self, spider):
        # Open a single MongoDB connection when the spider starts.
        self.client = pymongo.MongoClient(self.mongo_server, self.mongo_port)
        self.db = self.client[self.mongo_db]
        self.collection = self.mongo_collection

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Drop the whole item if any of its fields is empty.
        for field in item:
            if not item[field]:
                raise DropItem("Missing {0}!".format(field))
        # The item carries parallel lists of urls and titles;
        # write one document per scraped title.
        for n in range(len(item['title'])):
            self.db[self.collection].insert_one({
                'url': item['url'][n],
                'title': item['title'][n],
                'create_time': item['create_time'],
            })
        spider.logger.debug("Question added to MongoDB database!")
        return item
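For the pipeline to run, Scrapy needs the settings that from_crawler reads (MONGO_SERVER, MONGO_PORT, MONGO_DB, MONGO_COLLECTION) and an ITEM_PIPELINES entry pointing at the class. Below is a minimal sketch of the settings.py entries; the module path 'myproject.pipelines' and the concrete connection values are assumptions, not taken from the original.

# settings.py -- minimal sketch; 'myproject.pipelines' and the values below are assumed
ITEM_PIPELINES = {
    'myproject.pipelines.MongoDBPipeline': 300,
}

MONGO_SERVER = 'localhost'      # read by from_crawler as MONGO_SERVER
MONGO_PORT = 27017              # read as MONGO_PORT
MONGO_DB = 'scrapy_db'          # read as MONGO_DB
MONGO_COLLECTION = 'questions'  # read as MONGO_COLLECTION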