使用 Python 和 selenium 抓取 URL
我正在尝试让一个 python selenium 脚本运行,该脚本应该执行以下操作:
Take text file, BookTitle.txt that is a list of Book Titles.
Using Python/Selenium then searches the site, GoodReads.com for that title.
Takes the URL for the result and makes a new .CSV file with column 1=book title and column 2=Site URL
I hope that we can get this working, then please help me with step by step to get it to run.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options
from pyvirtualdisplay import Display
#from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common import keys
import csv
import time
import json
class Book:
def __init__(self, title, url):
self.title = title
self.url = url
def __iter__(self):
return iter([self.title, self.url])
url = 'https://www.goodreads.com/'
def create_csv_file():
header = ['Title', 'URL']
with open('/home/l/gDrive/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv', 'w+', encoding='utf-8') as csv_file:
wr = csv.writer(csv_file, delimiter=',')
wr.writerow(header)
def read_from_txt_file():
lines = [line.rstrip('\n') for line in open('/home/l/gDrive/AudioBookReviews/WebScraping/BookTitles.txt', encoding='utf-8')]
return lines
def init_selenium():
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
options = Options()
options.add_argument('--headless')
global driver
driver = webdriver.Chrome("/home/l/gDrive/AudioBookReviews/WebScraping/chromedriver", chrome_options=chrome_options)
driver.get(url)
time.sleep(30)
driver.get('https://www.goodreads.com/search?q=')
def search_for_title(title):
search_field = driver.find_element_by_xpath('//*[@id="search_query_main"]')
search_field.clear()
search_field.send_keys(title)
search_button = driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[1]/div[1]/div[2]/form/div[1]/input[3]')
search_button.click()
def scrape_url():
try:
url = driver.find_element_by_css_selector('a.bookTitle').get_attribute('href')
except:
url = "N/A"
return url
def write_into_csv_file(vendor):
with open('/home/l/gDrive/AudioBookReviews/WebScraping/GoodReadsBooksNew.csv', 'a', encoding='utf-8') as csv_file:
wr = csv.writer(csv_file, delimiter=',')
wr.writerow(list(vendor))
create_csv_file()
titles = read_from_txt_file()
init_selenium()
for title in titles:
search_for_title(title)
url = scrape_url()
book = Book(title, url)
write_into_csv_file(book)
运行上述脚本,我收到以下错误:
Traceback (most recent call last): File "/home/l/gDrive/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 68, in init_selenium() File "/home/l/gDrive/AudioBookReviews/WebScraping/GoodreadsScraper.py", line 41, in init_selenium driver = webdriver.Chrome("/home/l/gDrive/AudioBookReviews/WebScraping/chromedriver", chrome_options=chrome_options) File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/chrome/webdriver.py", line 81, in init desired_capabilities=desired_capabilities) File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/webdriver.py", line 157, in init self.start_session(capabilities, browser_profile) File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/webdriver.py", line 252, in start_session response = self.execute(Command.NEW_SESSION, parameters) File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute self.error_handler.check_response(response) File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response raise exception_class(message, screen, stacktrace) selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally (unknown error: DevToolsActivePort file doesn't exist) (The process started from chrome location /usr/bin/google-chrome is no longer running, so ChromeDriver is assuming that Chrome has crashed.) (Driver info: chromedriver=2.44.609551 (5d576e9a44fe4c5b6a07e568f1ebc753f1214634),platform=Linux 4.15.0-60-generic x86_64)
目前我可以看到几个错误:
1) 您必须取消注释 chrome options 并注释 firefox',因为您稍后会在代码中传递 chromedriver
# from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
顺便说一句,pyvirtualdisplay 是 headless chrome 的替代方案,您不需要导入它。
2) 您已实例化 Options 两次,并且只使用了第一个。将您的代码更改为:
def init_selenium():
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--headless')
我猜这两个只是开始,当您遇到下一个无法解决的问题时,请编辑您的问题。
您正在使用 chrome 驱动程序,但在导入时将其注释掉。
from selenium.webdriver.chrome.options import Options
在搜索功能中,流程为: 获取页面 -> 查找搜索框 -> 输入值 -> 输入键 -> 获取结果。
类似以下内容:
def search_for_title(title):
driver.get('https://www.goodreads.com/search?q=')
search_field = driver.find_element_by_name('q')
search_field.clear()
search_field.send_keys(title)
search_field.send_keys(keys.Keys.RETURN) # you missed this part
url = driver.find_element_by_xpath(
'/html/body/div[2]/div[3]/div[1]/div[2]/div[2]/table/tbody/tr[1]/td[2]/a')
print(url.get_attribute('href'))