尝试使用 requests-html (Python 3.6) 抓取 JS 网页时出现问题
2019-11-23
1032
上周我一直在尝试从 Epic Games Store 网页 ( https://www.epicgames.com/store/en-US/ ) 抓取信息,我首先尝试使用 Requests 模块,但很快意识到我需要一个支持 javascript 网页的模块。 这就是我现在正在尝试的,但有一个问题... 当我在页面上使用“检查元素”时,一切都很好,但是当我执行此操作时:
from requests_html import HTMLSession
# Fetch the Epic Games Store front page, execute its JavaScript, and dump the
# resulting HTML.
# The session is used as a context manager so that its connections (and the
# headless browser that render() launches) are released even if the request
# or the render step raises.
with HTMLSession() as session:
    r = session.get("https://www.epicgames.com/store/en-US/")
    # render() re-loads the page in a headless browser and replaces the
    # content of r.html with the rendered markup.
    r.html.render()
    print(r.html.html)
结果是一个无法读取的 html 文件,大多数元素都没有加载。 结果: https://pastebin.com/zQ9m1gr2
您可以自己验证:在商店网页上任选一款游戏,然后在结果文件中按 Ctrl + F 搜索它的名称,您会发现没有任何匹配项。 我该怎么办?
提前谢谢您!:)
编辑: 当我手动从浏览器下载 HTML 时,也会发生同样的情况。
1个回答
主页的初始 HTML 中不包含您要查找的数据,这意味着商店数据是在页面加载之后才另行获取的。因此,我们可以使用
requests
来模拟浏览器的操作来获取数据。
如果您查看开发人员工具中的网络选项卡,您将看到当页面加载时,它会从
graphql
端点接收商店数据。这意味着如果您模拟请求,您可以获取商店数据:
import requests
# GraphQL endpoint the store page talks to while it loads.
endpoint = "https://graphql.epicgames.com/graphql"
# Raw JSON request body that the browser sent to the endpoint when loading
# the page. The author could not work out how to write the query by hand, so
# this is the captured payload copied verbatim.
#
# It is a bytes literal on purpose: every doubled backslash reaches the
# server as a single backslash, i.e. as the JSON escapes \n (newline) and
# \u0021 ("!"). The \u0021 escape is spelled with a doubled backslash like
# the others because "\u" is not a valid escape in a bytes literal and
# triggers an invalid-escape-sequence warning; the bytes sent are unchanged.
query = b'{"query":"\\n query storefrontDiscoverQuery(\\n $locale:String,\\n $country:String\\u0021\\n ) {\\n Storefront {\\n storefrontModules(locale: $locale) {\\n ... on StorefrontBreaker {\\n type\\n title\\n titleGroup\\n description\\n backgroundColors\\n layout\\n link {\\n src\\n linkText\\n }\\n image {\\n src\\n alt\\n }\\n }\\n ... on StorefrontFreeGames {\\n type\\n title\\n }\\n ... on StorefrontCardGroup {\\n type\\n title\\n link {\\n src\\n linkText\\n }\\n offers {\\n namespace\\n id\\n offer {\\n \\n title\\n id\\n namespace\\n description\\n keyImages {\\n type\\n url\\n }\\n seller {\\n id\\n name\\n }\\n urlSlug\\n items {\\n id\\n namespace\\n }\\n customAttributes {\\n key\\n value\\n }\\n categories {\\n path\\n }\\n price(country: $country) {\\n totalPrice {\\n discountPrice\\n originalPrice\\n voucherDiscount\\n discount\\n fmtPrice(locale: $locale) {\\n originalPrice\\n discountPrice\\n intermediatePrice\\n }\\n }\\n lineOffers {\\n appliedRules {\\n id\\n endDate\\n }\\n }\\n }\\n linkedOfferId\\n linkedOffer {\\n effectiveDate\\n customAttributes {\\n key\\n value\\n }\\n }\\n \\n }\\n }\\n }\\n ... on StorefrontFeaturedCarousel {\\n type\\n title\\n slides {\\n title\\n eyebrow\\n description\\n backgroundColor\\n image {\\n src\\n alt\\n }\\n mobileImage {\\n src\\n alt\\n }\\n link {\\n src\\n linkText\\n }\\n }\\n }\\n ... on StorefrontTiles {\\n type\\n title\\n tiles {\\n label\\n genre\\n link {\\n src\\n linkText\\n }\\n }\\n }\\n }\\n }\\n }\\n ","variables":{"locale":"en-US","country":"US"}}'
# Replay the browser's request: POST the captured JSON payload to the GraphQL
# endpoint with the same content type.
data = requests.post(
    endpoint,
    headers={"Content-type": "application/json;charset=UTF-8"},
    data=query,
    # requests has no default timeout; without one a stalled server would
    # block this script forever.
    timeout=30,
)
# Fail with a clear HTTPError on a 4xx/5xx reply instead of trying to decode
# an error page as JSON below.
data.raise_for_status()
print(data.json())
它为我们提供了 此数据 。 (小心,它相当大。)
您还可以使用以下方法获取每个产品的信息:
import requests, json
# GraphQL endpoint that serves per-product catalog and price information.
endpoint = "https://graphql.epicgames.com/graphql"

# Request body for a single product: the GraphQL document plus the variables
# it declares. It is kept as a plain dict and serialized to JSON when sent.
query = {
    "query": "\n query catalogQuery(\n $productNamespace:String!,\n $offerId:String!,\n $locale:String,\n $country:String!,\n $lineOffers: [LineOfferReq]!) {\n Catalog {\n catalogOffer(namespace: $productNamespace,\n id: $offerId,\n locale: $locale) {\n namespace\n effectiveDate\n id\n customAttributes {\n key\n value\n }\n items {\n id\n status\n customAttributes {\n key\n value\n }\n }\n }\n }\n PriceEngine {\n price(country: $country, lineOffers: $lineOffers) {\n totalPrice {\n discountPrice\n originalPrice\n voucherDiscount\n discount\n currencyCode\n currencyInfo {\n decimals\n }\n fmtPrice(locale: $locale) {\n originalPrice\n discountPrice\n intermediatePrice\n }\n }\n lineOffers {\n appliedRules {\n endDate\n discountSetting {\n discountType\n }\n }\n }\n }\n }\n }\n ",
    "variables": {
        "productNamespace": "cosmos",
        # The offer id is found in the response of the first (storefront)
        # request; to be more precise, at
        # data.Storefront.storefrontModules[1].offers[0].id.
        "offerId": "1c55202badfc4212b4f82553d5d22c3e",
        "locale": "en-US",
        "country": "US",
        "lineOffers": [
            {
                # The same offer id goes here too.
                "offerId": "1c55202badfc4212b4f82553d5d22c3e",
                "quantity": 1,
            },
        ],
        "calculateTax": False,
    },
}
# Send the product query. json.dumps turns the dictionary into the JSON
# string that the endpoint expects as the request body.
data = requests.post(
    endpoint,
    headers={"Content-type": "application/json;charset=UTF-8"},
    data=json.dumps(query),
    # requests has no default timeout; without one a stalled server would
    # block this script forever.
    timeout=30,
)
# Fail with a clear HTTPError on a 4xx/5xx reply instead of trying to decode
# an error page as JSON below.
data.raise_for_status()
print(data.json())
这给了我们:
{
"data": {
"Catalog": {
"catalogOffer": {
"namespace": "cosmos",
"effectiveDate": "2019-07-12T00:00:00.000Z",
"id": "1c55202badfc4212b4f82553d5d22c3e",
"customAttributes": [
{
"key": "com.epicgames.app.blacklist",
"value": "KR"
},
{
"key": "isPrepurchase",
"value": "true"
},
{
"key": "availableDate",
"value": "1573570800"
},
{
"key": "developerName",
"value": "Human Head Studios, Inc."
}
],
"items": [
{
"id": "70c30983cf0948e4bffc23505f232b11",
"status": "ACTIVE",
"customAttributes": [
{
"key": "SupportedPlatforms",
"value": "Windows"
}
]
},
{
"id": "974e25b4bce6425d9af79cd5ffd64152",
"status": "ACTIVE",
"customAttributes": [
{
"key": "SupportedPlatforms",
"value": "Windows"
}
]
},
{
"id": "159d92ebec254ecf8373709a99388a62",
"status": "ACTIVE",
"customAttributes": [
{
"key": "SupportedPlatforms",
"value": "Windows"
}
]
},
{
"id": "cc67628ab455419cb3d4ecc907febbb7",
"status": "ACTIVE",
"customAttributes": [
{
"key": "SupportedPlatforms",
"value": "Windows"
}
]
},
{
"id": "2f742aa604a441d1a145f70411e9d8d2",
"status": "ACTIVE",
"customAttributes": [
{
"key": "SupportedPlatforms",
"value": "Windows"
}
]
}
]
}
},
"PriceEngine": {
"price": {
"totalPrice": {
"discountPrice": 2999,
"originalPrice": 2999,
"voucherDiscount": 0,
"discount": 0,
"currencyCode": "USD",
"currencyInfo": {
"decimals": 2
},
"fmtPrice": {
"originalPrice": "$29.99",
"discountPrice": "$29.99",
"intermediatePrice": "$29.99"
}
},
"lineOffers": [
{
"appliedRules": []
}
]
}
}
},
"extensions": {
"cacheControl": {
"version": 1,
"hints": [
{
"path": [
"Catalog"
],
"maxAge": 0
},
{
"path": [
"Catalog",
"catalogOffer"
],
"maxAge": 0
},
{
"path": [
"PriceEngine"
],
"maxAge": 0
},
{
"path": [
"PriceEngine",
"price"
],
"maxAge": 0
},
{
"path": [
"Catalog",
"catalogOffer",
"customAttributes"
],
"maxAge": 0
},
{
"path": [
"Catalog",
"catalogOffer",
"items"
],
"maxAge": 0
},
{
"path": [
"Catalog",
"catalogOffer",
"items",
0,
"customAttributes"
],
"maxAge": 0
},
{
"path": [
"Catalog",
"catalogOffer",
"items",
1,
"customAttributes"
],
"maxAge": 0
},
{
"path": [
"Catalog",
"catalogOffer",
"items",
2,
"customAttributes"
],
"maxAge": 0
},
{
"path": [
"Catalog",
"catalogOffer",
"items",
3,
"customAttributes"
],
"maxAge": 0
},
{
"path": [
"Catalog",
"catalogOffer",
"items",
4,
"customAttributes"
],
"maxAge": 0
},
{
"path": [
"PriceEngine",
"price",
"totalPrice"
],
"maxAge": 0
},
{
"path": [
"PriceEngine",
"price",
"totalPrice",
"currencyInfo"
],
"maxAge": 0
},
{
"path": [
"PriceEngine",
"price",
"totalPrice",
"fmtPrice"
],
"maxAge": 0
},
{
"path": [
"PriceEngine",
"price",
"lineOffers"
],
"maxAge": 0
},
{
"path": [
"PriceEngine",
"price",
"lineOffers",
0,
"appliedRules"
],
"maxAge": 0
}
]
}
}
}
显然,您可以从 此网址 获取免费游戏集合 ID。然后您可以使用此 ID 进行查询以获取游戏列表:
import requests, json
# GraphQL endpoint that serves the catalog data.
endpoint = "https://graphql.epicgames.com/graphql"

# Request body that asks for a collection offer and the offers it contains:
# the GraphQL document plus the variables it declares.
gamesCollectionQuery = {
    "query": "\n query catalogQuery($productNamespace:String!, $offerId:String!, $locale:String, $country:String!) {\n Catalog {\n catalogOffer(namespace: $productNamespace, id: $offerId, locale: $locale) {\n title\n collectionOffers {\n \n title\n id\n namespace\n description\n keyImages {\n type\n url\n }\n seller {\n id\n name\n }\n urlSlug\n items {\n id\n namespace\n }\n customAttributes {\n key\n value\n }\n categories {\n path\n }\n price(country: $country) {\n totalPrice {\n discountPrice\n originalPrice\n voucherDiscount\n discount\n fmtPrice(locale: $locale) {\n originalPrice\n discountPrice\n intermediatePrice\n }\n }\n lineOffers {\n appliedRules {\n id\n endDate\n }\n }\n }\n linkedOfferId\n linkedOffer {\n effectiveDate\n customAttributes {\n key\n value\n }\n }\n \n }\n customAttributes {\n key\n value\n }\n }\n }\n }\n ",
    "variables": {
        "productNamespace": "epic",
        # Id of the collection offer to list.
        "offerId": "7f22b3b15abc4821bba634340e2dd1ef",
        "locale": "es-ES",
        # NOTE(review): "EN" does not look like an ISO 3166-1 country code
        # (the other snippets on this page use "US") - confirm the intended value.
        "country": "EN",
    },
}
# Send the collection query; json.dumps serializes the dictionary into the
# JSON string used as the request body.
data = requests.post(
    endpoint,
    headers={"Content-type": "application/json;charset=UTF-8"},
    data=json.dumps(gamesCollectionQuery),
    # requests has no default timeout; without one a stalled server would
    # block this script forever.
    timeout=30,
)
# Fail with a clear HTTPError on a 4xx/5xx reply rather than printing an
# error page as if it were the result.
data.raise_for_status()
# Prints the raw response body (bytes), not the decoded JSON.
print(data.content)
Guven Degirmenci
2019-11-23