Scrapy - ผลลัพธ์สุดท้ายเท่านั้น

ฉันเกือบจะทำให้โปรแกรมที่กระท่อนกระแท่นนี้หยุดทำงานแล้ว ยกเว้นปัญหาสุดท้ายนี้ ฉันกำลังพยายามที่จะ

วนซ้ำรายการในแต่ละรายการหลายรายการบนเพจ
แยกข้อมูลบางส่วนในหน้ารายการแรกนี้สำหรับแต่ละรายการ ['RStation']
ป้อน URL ของแต่ละรายการผ่านทาง href
แยกข้อมูลบางส่วนโดยการวนซ้ำรายการในหน้าถัดไปนี้
สร้างรายการเดียวด้วยข้อมูลจากหน้าหลักและหน้าถัดไป

ปัญหาคือเมื่อฉันเปิด csv ฉันจะเห็นเฉพาะรายการซ้ำของรายการสุดท้ายในรายการวนซ้ำที่ 2 (สำหรับแต่ละรายการของรายการแรก)

ฉันกำลังต่อท้ายรายการอย่างไม่ถูกต้องหรือใช้ response.meta ในทางที่ผิดหรือไม่? ฉันพยายามปฏิบัติตามเอกสารประกอบสำหรับ response.meta และฉันไม่เข้าใจว่าทำไมสิ่งนี้ถึงไม่ทำงาน

ความช่วยเหลือใด ๆ ที่จะได้รับการชื่นชมอย่างมาก

import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from fspeople.items import FspeopleItem

class FSSpider(scrapy.Spider):
name = "fspeople"
allowed_domains = ["fs.fed.us"]
start_urls = [
    "http://www.fs.fed.us/research/people/people_search_results.php?3employeename=&keywords=&station_id=SRS&state_id=ALL",
    #"http://www.fs.fed.us/research/people/people_search_results.php?employeename=&keywords=&station_id=RMRS&state_id=ALL",
    #"http://www.fs.fed.us/research/people/people_search_results.php?employeename=&keywords=&station_id=PSW&state_id=ALL",
    #"http://www.fs.fed.us/research/people/people_search_results.php?employeename=&keywords=&station_id=PNW&state_id=ALL",
    #"http://www.fs.fed.us/research/people/people_search_results.php?employeename=&keywords=&station_id=NRS&state_id=ALL",
    #"http://www.fs.fed.us/research/people/people_search_results.php?employeename=&keywords=&station_id=IITF&state_id=ALL",
    #"http://www.fs.fed.us/research/people/people_search_results.php?employeename=&keywords=&station_id=FPL&state_id=ALL",
    #"http://www.fs.fed.us/research/people/people_search_results.php?employeename=&keywords=&station_id=WO&state_id=ALL"
]
def __init__(self):
    self.i = 0

def parse(self,response):
    for sel in response.xpath("//a[@title='Click to view their profile ...']/@href"):
        item = FspeopleItem()
        url = response.urljoin(sel.extract())
        item['RStation'] = response.xpath("//table[@id='table_id']/tbody/tr/td[2]/i/b/text() | //table[@id='table_id']/tbody/td[2]/text()").extract_first().strip()
        request = scrapy.Request(url, callback=self.parse_post)
        request.meta['item'] = item
        yield request
    self.i += 1

def parse_post(self, response):
    theitems = []
    pubs = response.xpath("//div/h2[text()='Featured Publications & Products']/following-sibling::ul[1]/li | //div/h2[text()='Publications']/following-sibling::ul[1]/li")
    for i in pubs:
        item = response.meta['item']
        name = response.xpath("//div[@id='maincol']/h1/text() | //nobr/text()").extract_first().strip()
        pubname = i.xpath("a/text()").extract_first().strip()
        pubauth = i.xpath("text()").extract_first().strip()
        pubURL = i.xpath("a/@href").extract_first().strip()
        #RStation = response.xpath("//div[@id='right-float']/div/div/ul/li/a/text()").extract_first().strip()

        item['link'] = response.url
        item['name'] = name
        item['pubname'] = pubname
        item['pubauth'] = pubauth
        item['pubURL'] = pubURL
        #item['RStation'] = RStation

        theitems.append(item)
    return theitems

python scrapy

Chris 23.04.2016 แหล่งที่มา

comment

คุณกำลังเอาชนะ __init__ แต่คุณไม่ได้เรียก super สำหรับ scrapy.Spider - Steven Almeroth 25.04.2016

comment

คุณกำลังวนซ้ำรายการเดียวกันในวงของคุณ ลอง `item = response.meta.get ('item') - Steven Almeroth 25.04.2016

คำตอบ (1)

arrow_upward
0
arrow_downward

สร้างอินสแตนซ์ใหม่ของรายการสำหรับการวนซ้ำแต่ละครั้ง

def parse_post(self, response):
    [...]
    for i in pubs:
        item = response.meta['item']
        item = item.copy()
        [...]

Frederic Bazin 25.04.2016

Scrapy - ผลลัพธ์สุดท้ายเท่านั้น

คำตอบ (1)

คำถามในหัวข้อ