ไม่สามารถดึงข้อมูล (scrape) จากหน้าเว็บที่มีหลายตารางด้วย Python lxml

ฉันกำลังพยายามดึงข้อมูล (scrape) จากเว็บนี้ แต่ไม่ได้ผลลัพธ์ใดๆ เลย ทั้งที่โค้ดเดียวกันใช้ได้กับหน้าอื่นๆ ที่มีตารางง่ายๆ เพียงตารางเดียว ช่วยดูโค้ดให้หน่อยได้ไหม

import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd
import urllib

def scrape_table(url):
    """Fetch *url* and return every ``<tr>`` element found in the page.

    Args:
        url: Address of the HTML page to download.

    Returns:
        A list of lxml elements, one per ``<tr>`` matched by the XPath
        ``//tr`` (rows of every table on the page, header rows included).
        The list is empty when the page contains no table rows.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # A timeout keeps the call from blocking forever on a stalled server.
    page = requests.get(url, timeout=30)
    # Fail loudly on error responses: otherwise an error page (e.g. a 404 for
    # a wrongly built URL) is parsed like any other document and the caller
    # only sees an empty or unrelated result.
    page.raise_for_status()

    tree = html.fromstring(page.content)
    # '//tr' matches rows anywhere in the document, so it does not depend on
    # the rows being wrapped in <thead>/<tbody> or on the table's position.
    rows = tree.xpath('//tr')

    # NOTE: converting a single table to a DataFrame (pd.read_html) was tried
    # earlier and is intentionally not done here; callers get raw elements.
    return rows

# Ticker whose PE-ratio page should be scraped.
symbol = 'AMZN'

# (A previous attempt targeted the Yahoo Finance quote page,
#  'https://finance.yahoo.com/quote/<symbol>?p=<symbol>', and is disabled.)

# Build the macrotrends address from the ticker alone.
url = "https://www.macrotrends.net/stocks/charts/{}/pe-ratio".format(symbol)

# Request the page once just to learn the final URL the server reports,
# then show it and scrape that address.
data = requests.request(method="GET", url=url)
url_completo = data.url
print(url_completo)
df_pe = scrape_table(url=url_completo)

นี่คือโค้ด HTML ของหน้าเว็บที่ฉันพยายามดึงข้อมูล: https://www.macrotrends.net/stocks/charts/TMO/thermo-fisher-scientific/pe-ratio

<div id="style-1" style="background-color:#fff; height: 500px; overflow:auto; margin: 0px 0px 30px 0px; padding:0px 30px 20px 0px; border:1px solid #dfdfdf;">

                <table class="table">
                <thead>
                  <tr>
                    <th colspan="4" style="text-align:center;">Thermo Fisher Scientific PE Ratio Historical Data</th>
                  </tr>
                </thead>
                <thead>
                  <tr>
                    <th style="text-align:center;">Date</th>
                    <th style="text-align:center;">Stock Price</th>
                    <th style="text-align:center;">TTM Net EPS</th>
                    <th style="text-align:center;">PE Ratio</th>
                  </tr>
                </thead>
                <tbody><tr>
                    <td style="text-align:center;">2019-04-12</td>
                    <td style="text-align:center;">280.65</td>
                    <td style="text-align:center;"></td>
                    <td style="text-align:center;">38.71</td>
                 </tr><tr>
                    <td style="text-align:center;">2018-12-31</td>
                    <td style="text-align:center;">223.79</td>
                    <td style="text-align:center;">$7.25</td>
                    <td style="text-align:center;">30.87</td>
                 </tr><tr>
                    <td style="text-align:center;">2018-09-30</td>
                    <td style="text-align:center;">243.90</td>
                    <td style="text-align:center;">$6.33</td>
                    <td style="text-align:center;">38.53</td>
                 </tr><tr>
                    <td style="text-align:center;">2018-06-30</td>
                    <td style="text-align:center;">206.84</td>
                    <td style="text-align:center;">$5.92</td>
                    <td style="text-align:center;">34.94</td>
                 </tr>
              </table>          

            </div>

python lxml

amigosalvaro 13.04.2019 แหล่งที่มา

comment

URL ของคุณผิด โค้ดของคุณพยายามดึงข้อมูลจาก macrotrends.net/stocks/charts/TMO/pe-ratio ไม่ใช่ macrotrends.net/stocks/charts/TMO/thermo-fisher-scientific/pe-ratio ดังนั้นคุณจะได้รับหน้า 404 - Dan-Dev 14.04.2019

คำตอบ (1)

arrow_upward
0
arrow_downward

คุณสร้าง URL ไม่ถูกต้อง โค้ดด้านล่างนี้จะดึงข้อมูลสองตาราง ตารางแรกเป็นของ Amazon และตารางถัดไปเป็นของ Thermo Fisher Scientific

import lxml
from lxml import html
import requests
import pandas as pd

# Keep wide DataFrames on a single row of output instead of wrapping columns.
pd.options.display.expand_frame_repr = False

def scrape_table(url):
    """Download *url* and return its first HTML table as a DataFrame.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``pandas.DataFrame`` that ``pandas.read_html`` builds from the
        first ``<table>`` element matched on the page.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
        IndexError: If the page contains no matching ``<table>`` element.
    """
    # Local import so the block stays self-contained; StringIO is only used
    # to hand the serialized markup to pandas as a file-like object.
    from io import StringIO

    # A timeout keeps the call from blocking forever on a stalled server.
    page = requests.get(url, timeout=30)
    # Stop on error responses instead of parsing the error page as if it
    # were the requested document.
    page.raise_for_status()

    tree = html.fromstring(page.content)
    tables = tree.findall('.//*/table')
    if not tables:
        # Same exception type as the bare tables[0] lookup would raise, but
        # with a message that says what went wrong and where.
        raise IndexError("no <table> element found at {}".format(url))

    # Serialize only the first table to text and wrap it in StringIO:
    # newer pandas versions deprecate passing literal markup to read_html.
    markup = lxml.etree.tostring(tables[0], method='html', encoding='unicode')
    return pd.read_html(StringIO(markup))[0]


# Each entry is the "<TICKER>/<company-slug>" path segment of the page URL.
for symbol in ('AMZN/amazon', 'TMO/thermo-fisher-scientific'):
    url = "https://www.macrotrends.net/stocks/charts/{}/pe-ratio".format(symbol)
    # Request the page once to obtain the final URL the server reports.
    data = requests.request(method="GET", url=url)
    url_completo = data.url
    print(url_completo)
    # Scrape that address and show the resulting table.
    df_pe = scrape_table(url=url_completo)
    print(df_pe)

เอาท์พุต:

   Amazon PE Ratio Historical Data                                 
                              Date Stock Price TTM Net EPS PE Ratio
0                       2019-04-12     1843.06         NaN    91.56
1                       2018-12-31     1501.97      $20.13    74.61
2                       2018-09-30     2003.00      $17.84   112.28
...
   Thermo Fisher Scientific PE Ratio Historical Data                                 
                                                Date Stock Price TTM Net EPS PE Ratio
0                                         2019-04-12      280.65         NaN    38.71
1                                         2018-12-31      223.79       $7.25    30.87
2                                         2018-09-30      243.90       $6.33    38.53
...

Dan-Dev 13.04.2019

ไม่สามารถดึงข้อมูล (scrape) จากหน้าเว็บที่มีหลายตารางด้วย Python lxml

คำตอบ (1)

คำถามในหัวข้อ