如何在scrapy框架下用python爬取json文件

2025-03-21 04:21:25
推荐回答(2个)
回答1:

生成Request的时候与一般的网页是相同的,提交Request后scrapy就会下载相应的网页生成Response,这时只用解析response.body按照解析json的方法就可以提取数据了。代码示例如下(以京东为例,其中的parse_phone_price和parse_commnets是通过json提取的,省略部分代码):

# -*- coding: utf-8 -*-
 
from scrapy.spiders import Spider, CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from jdcom.items import JdPhoneCommentItem, JdPhoneItem
from scrapy import Request
from datetime import datetime
import json
import logging
import re
 
logger = logging.getLogger(__name__)
 
 
class JdPhoneSpider(CrawlSpider):
    name = "jdPhoneSpider"
    start_urls = ["http://list.jd.com/list.html?cat=9987,653,655"]
 
    rules = (
        Rule(
            LinkExtractor(allow=r"list\.html\?cat\=9987,653,655\&page\=\d+\&trans\=1\&JL\=6_0_0"),
            callback="parse_phone_url",
            follow=True,
        ),
    )
 
    def parse_phone_url(self, response):
        hrefs = response.xpath("//div[@id='plist']/ul/li/div/div[@class='p-name']/a/@href").extract()
        phoneIDs = []
        for href in hrefs:
            phoneID = href[14:-5]
            phoneIDs.append(phoneID)
            commentsUrl = "http://sclub.jd.com/productpage/p-%s-s-0-t-3-p-0.html" % phoneID
            yield Request(commentsUrl, callback=self.parse_commnets)
 
    def parse_phone_price(self, response):
        phoneID = response.meta['phoneID']
        meta = response.meta
        priceStr = response.body.decode("gbk", "ignore")
        priceJson = json.loads(priceStr)
        price = float(priceJson[0]["p"])
        meta['price'] = price
        phoneUrl = "http://item.jd.com/%s.html" % phoneID
        yield Request(phoneUrl, callback=self.parse_phone_info, meta=meta)
 
    def parse_phone_info(self, response):  
        pass
 
    def parse_commnets(self, response):
 
        commentsItem = JdPhoneCommentItem()
        commentsStr = response.body.decode("gbk", "ignore")
        commentsJson = json.loads(commentsStr)
        comments = commentsJson['comments']
 
        for comment in comments:
            commentsItem['commentId'] = comment['id']
            commentsItem['guid'] = comment['guid']
            commentsItem['content'] = comment['content']
            commentsItem['referenceId'] = comment['referenceId']
            # 2016-09-19 13:52:49  %Y-%m-%d %H:%M:%S
            datetime.strptime(comment['referenceTime'], "%Y-%m-%d %H:%M:%S")
            commentsItem['referenceTime'] = datetime.strptime(comment['referenceTime'], "%Y-%m-%d %H:%M:%S")
 
            commentsItem['referenceName'] = comment['referenceName']
            commentsItem['userProvince'] = comment['userProvince']
            # commentsItem['userRegisterTime'] = datetime.strptime(comment['userRegisterTime'], "%Y-%m-%d %H:%M:%S")
            commentsItem['userRegisterTime'] = comment.get('userRegisterTime')
            commentsItem['nickname'] = comment['nickname']
            commentsItem['userLevelName'] = comment['userLevelName']
            commentsItem['userClientShow'] = comment['userClientShow']
            commentsItem['productColor'] = comment['productColor']
            # commentsItem['productSize'] = comment['productSize']
            commentsItem['productSize'] = comment.get("productSize")
            commentsItem['afterDays'] = int(comment['days'])
            images = comment.get("images")
            images_urls = ""
            if images:
                for image in images:
                    images_urls = image["imgUrl"] + ";"
            commentsItem['imagesUrl'] = images_urls
        yield commentsItem
 
        commentCount = commentsJson["productCommentSummary"]["commentCount"]
        goodCommentsCount = commentsJson["productCommentSummary"]["goodCount"]
        goodCommentsRate = commentsJson["productCommentSummary"]["goodRate"]
        generalCommentsCount = commentsJson["productCommentSummary"]["generalCount"]
        generalCommentsRate = commentsJson["productCommentSummary"]["generalRate"]
        poorCommentsCount = commentsJson["productCommentSummary"]["poorCount"]
        poorCommentsRate = commentsJson["productCommentSummary"]["poorRate"]
        phoneID = commentsJson["productCommentSummary"]["productId"]
 
        priceUrl = "http://p.3.cn/prices/mgets?skuIds=J_%s" % phoneID
        meta = {
            "phoneID": phoneID,
            "commentCount": commentCount,
            "goodCommentsCount": goodCommentsCount,
            "goodCommentsRate": goodCommentsRate,
            "generalCommentsCount": generalCommentsCount,
            "generalCommentsRate": generalCommentsRate,
            "poorCommentsCount": poorCommentsCount,
            "poorCommentsRate": poorCommentsRate,
        }
        yield Request(priceUrl, callback=self.parse_phone_price, meta=meta)
 
        pageNum = commentCount / 10 + 1
        for i in range(pageNum):
            commentsUrl = "http://sclub.jd.com/productpage/p-%s-s-0-t-3-p-%d.html" % (phoneID, i)
            yield Request(commentsUrl, callback=self.parse_commnets)

回答2:

import json
str = str[(str.find('(')+1):str.rfind(')')] #去掉首尾的圆括号前后部分
dict = json.loads(str)
comments = dict['comments']
#然后for一下就行了