mirror of
https://github.com/NaiboWang/EasySpider.git
synced 2025-04-20 04:39:57 +08:00
44 lines
1.1 KiB
Python
44 lines
1.1 KiB
Python
# from lxml import etree
|
|
|
|
# # 解析HTML
|
|
# html = """
|
|
# <div>
|
|
# 123
|
|
# <ul class="list">
|
|
# <li class="item-0">first item</li>
|
|
# <li class="item-1"><a href="link2.html">second item</a></li>
|
|
# </ul>
|
|
# 456
|
|
# <div></div>
|
|
# 789
|
|
# </div>
|
|
# """
|
|
# html = etree.HTML(html)
|
|
# element = html.xpath("*")
|
|
# direct_text = "/html/body/" + html[0][0].tag + "/text()"
|
|
# all_text = "/html/body/" + html[0][0].tag + "//text()"
|
|
# # 使用XPath选择元素
|
|
# results = html.xpath(direct_text)
|
|
# # print(results)
|
|
# # 拼接所有文本内容并去掉两边的空白
|
|
# text = ' '.join(result.strip() for result in results if result.strip())
|
|
|
|
# # 输出结果
|
|
# print(text)
|
|
|
|
# results = html.xpath(all_text)
|
|
# # print(results)
|
|
# # 拼接所有文本内容并去掉两边的空白
|
|
# text = ' '.join(result.strip() for result in results if result.strip())
|
|
|
|
# # 输出结果
|
|
# print(text)
|
|
|
|
import re
|
|
|
|
def lowercase_xpath_tags(xpath: str) -> str:
    """Return *xpath* with its upper-case tag names converted to lower case.

    A "tag name" here is a purely textual match: an ASCII upper-case letter
    followed by any number of upper-case letters or digits, where the run is
    immediately followed by ``[``, ``]``, ``/`` or the end of the string.
    Digits are accepted after the first letter so that heading tags such as
    ``H1`` are converted too.

    The string is not parsed as XPath, so any upper-case run that sits
    directly before one of those delimiters (for example the attribute name
    in ``[@ID]``) is lower-cased as well. Text followed by anything else,
    such as a quoted attribute value, is left unchanged.

    Args:
        xpath: The XPath expression to normalise.

    Returns:
        The expression with every matching run lower-cased.
    """
    # The lookahead keeps the delimiter out of the match, so only the
    # name itself is rewritten.
    return re.sub(
        r"([A-Z][A-Z0-9]*)(?=[\[\]/]|$)",
        lambda match: match.group(0).lower(),
        xpath,
    )
|
|
|
|
# Ad-hoc manual checks; each statement below writes one line to stdout.

# Run the tag normaliser on an XPath whose tag names are upper-case.
print(
    lowercase_xpath_tags(
        '//DIV[@id="J_recommendGoods"]/DIV[2]/UL'
    )
)

# A literal XPath string, printed as-is (not passed through the function).
print(
    "//strong//span[contains(@class,'page-item_M4MDr')]/..//following-sibling::a[1]"
)

# Emit an empty line.
print("")