python scrapy爬取网站数据一
来源:风水月
发布时间:2018-07-03 17:24:48
阅读量:1405
原来写过一篇scrapy的介绍,说了下scrapy的环境如何配置,该篇博客地址是:win10 python安装及环境配置、scrapy框架安装及PyCharm集成
本篇会从一个实际的例子当中记录scrapy的使用
大家都对三国很熟,下面我们从 三国在线(http://www.e3ol.com/biography-index.html)来获取三国人物数据,获取三国人物数据的整体代码如下,本代码抓取数据的网址返回的是JSON格式的数据,本代码将解析该JSON数据,并将其按json的键创建数据表,保存人物信息
import scrapy
import json
import pymysql
import re
from sgyyScrapy.items import SgyyscrapyItem

# Pretend to be a desktop Chrome browser so the site serves the same JSON
# payload it would serve to a real visitor.
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}


class sgyyScrapy(scrapy.Spider):
    """Crawl the e3ol.com biography index and store every Three-Kingdoms
    character record into a local MySQL table ``sgyy_person``.

    The listing endpoint returns pseudo-JSON (unquoted keys, unicode
    escapes); ``parse`` normalises it, then creates the table from the
    first record's keys and inserts one row per character.
    """

    name = "sgyyScrapy"
    # Fixed typo: the original attribute was spelled "allowed_domins", which
    # Scrapy silently ignored (no offsite filtering ever happened).  Scrapy
    # expects bare domain names here, not full URLs.
    allowed_domains = ["e3ol.com"]
    start_urls = []
    # Becomes True once sgyy_person has been (re)created, so the DDL runs
    # only for the very first record scraped.
    isCreateTable = False

    def start_requests(self):
        """Yield one request per (faction, page) listing combination.

        The endpoint is paginated per "main faction" (a2=1..14) with up to
        50 pages each:
        http://www.e3ol.com/biography/inc_ajax.asp?types=index&a2=<f>&pageno=<p>
        """
        url_tmpl = ('http://www.e3ol.com/biography/inc_ajax.asp'
                    '?types=index&a2=%s&pageno=%s')
        for faction in range(1, 15):
            for page in range(1, 51):
                url = url_tmpl % (faction, page)
                # Keep start_urls populated for introspection/debugging,
                # as the original code did.
                self.start_urls.append(url)
                yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        """Parse one JSON listing page and persist each person record.

        Creates table ``sgyy_person`` (one varchar column per JSON key)
        for the first record seen, then inserts every record of every
        page.  Errors on a single statement roll back and continue.
        """
        # The body is unicode-escaped (Chinese appears as \u4e2d\u6587);
        # decode the escapes, strip the wrapping bracket pair, and drop
        # spaces, as the original code did.
        # NOTE: body_as_unicode() is deprecated; response.text is the
        # supported equivalent.
        raw = response.text
        decoded = raw.encode('utf-8').decode('unicode_escape')
        decoded = decoded[1:-1].replace(" ", "")
        print(decoded)
        # The feed's keys carry no quotes, which json.loads rejects;
        # quote them before parsing.
        quoted = self.quote_keys_for_json(decoded)
        print(quoted)
        records = json.loads(quoted.replace("'", "\""))

        db = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                             password="zhl", database="sgyy", charset='utf8')
        try:
            cursor = db.cursor()
            for item in records['soul']:
                keys = list(item)
                # Column names cannot be bound as query parameters, so
                # only plain identifiers from the feed are accepted —
                # this closes the injection path through hostile keys.
                if not keys or not all(re.fullmatch(r'\w+', k) for k in keys):
                    continue
                if not self.isCreateTable:
                    create_sql = ("create table sgyy_person(" +
                                  ", ".join(k + " varchar(1000)" for k in keys) +
                                  ")")
                    try:
                        print(create_sql)
                        cursor.execute("DROP TABLE IF EXISTS sgyy_person")
                        cursor.execute(create_sql)
                        db.commit()
                    except Exception:
                        print("发生错误,回滚事务")
                        db.rollback()
                    self.isCreateTable = True
                insert_sql = ("insert into sgyy_person(" + ",".join(keys) +
                              ") values (" + ",".join(["%s"] * len(keys)) + ")")
                try:
                    print(insert_sql)
                    # Values are bound as parameters instead of being
                    # interpolated into the SQL string — fixes both the
                    # SQL-injection hole and breakage on values that
                    # contain quotes.
                    cursor.execute(insert_sql, [item[k] for k in keys])
                    db.commit()
                except Exception:
                    print("发生错误,回滚事务")
                    db.rollback()
            cursor.close()
        finally:
            # The original leaked the connection on any unhandled error.
            db.close()
print("结束") def quote_keys_for_json(self,json_str):
# """给键值不带双引号的json字符串的所有键值加上双引号。
# 注:解析一般的不严格的json串,可以checkout https://github.com/dmeranda/demjson, 速度比标准库要慢。"""
quote_pat = re.compile(r'".*?"')
a = quote_pat.findall(json_str)
json_str = quote_pat.sub('@', json_str)
key_pat = re.compile(r'(\w+):')
json_str = key_pat.sub(r'"\1":', json_str) assert json_str.count('@') == len(a)
count = -1
def put_back_values(match):
nonlocal count
count += 1
return a[count]
json_str = re.sub('@', put_back_values, json_str) return json_str
原文地址:https://blog.csdn.net/fengshuiyue/article/details/80857875