Saya melakukan pengikisan web menggunakan beautifulsoup dengan python 3.7. Kode di bawah ini berhasil menggores tanggal, judul, tag tetapi tidak isi artikelnya. Sebaliknya, ia memberikan None.
import time
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
url = 'https://www.thehindu.com/search/?q=cybersecurity&order=DESC&sort=publishdate&ct=text&page={}'
pages = 32
for page in range(4, pages+1):
res = requests.get(url.format(page))
soup = BeautifulSoup(res.text,"lxml")
for item in soup.find_all("a", {"class": "story-card75x1-text"}, href=True):
_href = item.get("href")
try:
resp = requests.get(_href)
except Exception as e:
try:
resp = requests.get("https://www.thehindu.com"+_href)
except Exception as e:
continue
dateTag = soup.find("span", {"class": "dateline"})
sauce = BeautifulSoup(resp.text,"lxml")
tag = sauce.find("a", {"class": "section-name"})
titleTag = sauce.find("h1", {"class": "title"})
contentTag = sauce.find("div", {"class": "_yeti_done"})
date = None
tagName = None
title = None
content = None
if isinstance(dateTag,Tag):
date = dateTag.get_text().strip()
if isinstance(tag,Tag):
tagName = tag.get_text().strip()
if isinstance(titleTag,Tag):
title = titleTag.get_text().strip()
if isinstance(contentTag,Tag):
content = contentTag.get_text().strip()
print(f'{date}\n {tagName}\n {title}\n {content}\n')
time.sleep(3)
Saya tidak melihat di mana masalahnya karena saya menulis kelas yang benar di contentTag.
Terima kasih.