Scrapy: Storing Data with a MySQLdb Pipeline

This uses the MySQLdb (MySQL-python) driver; it works well enough, heh.

Here's the code:

from scrapy import log
from scrapy.exceptions import DropItem
import MySQLdb


class MysqlDBPipeline(object):

    def __init__(self, mysql_host, mysql_db, mysql_user, mysql_passwd):
        self.mysql_host = mysql_host
        self.mysql_db = mysql_db
        self.mysql_user = mysql_user
        self.mysql_passwd = mysql_passwd

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection parameters from the project settings
        return cls(
            mysql_host=crawler.settings.get('MYSQL_HOST'),
            mysql_user=crawler.settings.get('MYSQL_USER'),
            mysql_passwd=crawler.settings.get('MYSQL_PASSWD'),
            mysql_db=crawler.settings.get('MYSQL_DB')
        )

    def open_spider(self, spider):
        # Open a single connection for the whole crawl
        try:
            self.conn = MySQLdb.connect(
                user=self.mysql_user,
                passwd=self.mysql_passwd,
                db=self.mysql_db,
                host=self.mysql_host,
                charset='utf8',
                use_unicode=True
            )
            self.cursor = self.conn.cursor()
        except MySQLdb.Error as e:
            print "MySQL Error %d: %s" % (e.args[0], e.args[1])

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        # Drop the item if any populated field is empty
        for field in item:
            if not item[field]:
                raise DropItem("Missing {0}!".format(field))

        # item['title'] and item['url'] are parallel lists; insert one row per pair
        for n in range(len(item['title'])):
            try:
                value = [item['title'][n], item['url'][n], item['create_time']]
                self.cursor.execute(
                    'INSERT INTO itunes (title, url, create_time) VALUES (%s, %s, %s)',
                    value
                )
                self.conn.commit()
            except MySQLdb.Error as e:
                print "MySQL Error %d: %s" % (e.args[0], e.args[1])

        log.msg("Item added to the MySQL database!", level=log.DEBUG, spider=spider)

        return item
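
For reference, `process_item` above treats `title` and `url` as parallel lists (indexed by the same `n`) and `create_time` as a single value shared by every row. A matching Item definition might look like the sketch below; the class name `AppleItem` is hypothetical, only the field names come from the pipeline code:

# items.py -- a minimal sketch matching what process_item expects
from scrapy.item import Item, Field

class AppleItem(Item):
    title = Field()        # list of titles, parallel to url
    url = Field()          # list of URLs
    create_time = Field()  # single timestamp applied to every inserted row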

Then enable the pipeline in the settings file:

ITEM_PIPELINES = {
    'apple.pipelines.MysqlDBPipeline': 100
}
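
Since `from_crawler` reads the connection parameters from the settings, those keys must be defined there as well. A sketch with placeholder values (host, user, password, and database name are assumptions; adjust them to your environment). It also assumes the target `itunes` table already exists with `title`, `url`, and `create_time` columns:

# settings.py -- connection parameters read by from_crawler;
# the values here are placeholders, adjust to your environment
MYSQL_HOST = 'localhost'
MYSQL_DB = 'apple'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'secret'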