Web Scraping with BeautifulSoup, lxml, and requests_html

"""
爬虫
"""
from bs4 import BeautifulSoup
import requests
from lxml import etree

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="https://www.jb51.net" class="sister" id="link1">Elsie</a>,
<a href="https://www.jb51.net" class="sister" id="link2">Lacie</a> and
<a href="https://www.jb51.net" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# soup = BeautifulSoup(html_doc, features="html.parser")
# print(soup.prettify())  # pretty-print the parsed document
# print(soup.a)  # first <a> tag in the document
# print(soup.find_all('a'))  # list of all <a> tags
# print(soup.find('a'))  # first matching <a> tag
# print(soup.find(id='link3'))  # the tag whose id is "link3"
# print(soup.get_text())  # all text in the document
# print(soup.find_all('p', attrs={"class": "title"}))  # <p> tags with class "title"
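
# CSS selectors also work, via select()/select_one() -- a minimal sketch,
# assuming the soup object above is uncommented:
# print(soup.select('p.story a.sister'))  # all <a class="sister"> inside p.story
# print(soup.select_one('p.title b').text)  # The Dormouse's story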

# Get the text of each <a class="sister"> tag
# for document in soup.find_all('a', attrs={"class": "sister"}):
#     print(document.text)  # equivalent to document.get_text()
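
# Attributes can be read alongside the text -- a sketch, again assuming soup
# is uncommented; tag.get('href') returns None if the attribute is missing:
# for link in soup.find_all('a', attrs={"class": "sister"}):
#     print(link.get('href'), link.text)  # e.g. https://www.jb51.net Elsie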

# Oracle docs: BeautifulSoup exercise -- scrape the ISO code table into a dict
# res = requests.get('https://docs.oracle.com/cd/E13214_01/wli/docs92/xref/xqisocodes.html')
# bs = BeautifulSoup(res.text, features='html.parser')
# data = []
# result = {}
# for i in bs.find_all('div', attrs={'class': 'pCellBody'}):
#     data.append(i.text.strip().replace('\n', '').replace('\t', ''))
#
# for j in range(0, len(data), 2):
#     result[data[j]] = data[j+1]
#
# print(result)
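
# The pairwise loop above can also be written with slices and zip -- an
# equivalent sketch (zip silently drops a trailing unpaired cell):
# result = dict(zip(data[::2], data[1::2]))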

# Oracle docs: lxml exercise -- the same table via XPath
# res = requests.get('https://docs.oracle.com/cd/E13214_01/wli/docs92/xref/xqisocodes.html')
# et = etree.HTML(res.text)
# data = []
# result = {}
# # pass an XPath expression
# for i in et.xpath("//table[@id='wp1250799table1250793']//div[@class='pCellBody']/text()"):
#     data.append(i.replace('\n', '').replace('\t', '').strip())
#
# for j in range(0, len(data), 2):
#     result[data[j]] = data[j+1]
# print(result)
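
# The same XPath approach works offline on the html_doc defined above -- a
# minimal sketch extracting the sisters' names:
# tree = etree.HTML(html_doc)
# print(tree.xpath("//a[@class='sister']/text()"))  # ['Elsie', 'Lacie', 'Tillie']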

# Scraping data with requests_html
from requests_html import HTMLSession
data = []
result = {}
session = HTMLSession()
res = session.get('https://docs.oracle.com/cd/E13214_01/wli/docs92/xref/xqisocodes.html')
# print(res.html.links)  # all links found in the page
# print(res.html.absolute_links)  # the same links resolved to absolute URLs
for i in res.html.xpath("//table[@id='wp1252455table1251750']//div[@class='pCellBody']/text()"):
    data.append(i.strip().replace('\r\n', ''))

for j in range(0, len(data), 2):
    result[data[j]] = data[j+1]
print(result)
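
# requests_html also supports CSS selectors -- a sketch; find() returns
# Element objects whose .text holds the cell text (note this matches every
# pCellBody div on the page, not just the one table targeted above):
# for cell in res.html.find('div.pCellBody'):
#     print(cell.text)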