"""Fetch one hk01.com article and extract its title, author and plain text."""
import re

# Browser-like User-Agent header sent with the article request.
header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.78"
}

# Article page scraped when this file is run as a script.
ARTICLE_URL = "https://www.hk01.com/%E7%AA%81%E7%99%BC/669730/%E9%A6%AC%E6%96%99%E6%B0%B4%E5%BE%80%E6%9D%B1%E5%B9%B3%E6%B4%B2%E6%B8%A1%E8%BC%AA%E5%86%92%E7%85%99-320%E4%B9%98%E5%AE%A2%E5%8F%8A%E8%88%B9%E5%93%A1%E9%A0%86%E5%88%A9%E7%96%8F%E6%95%A3"

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Patterns are compiled once at import time rather than on every use.
# Headline: group 1 is the rest of the <h1> attributes, group 2 the title text.
_TITLE_RE = re.compile(r'<h1 id=\"articleTitle\"([\s\S]*?)>([\s\S]*?)\<\/h1\>')
# Byline: captures the text of the span that follows the "撰文:" label.
_AUTHOR_RE = re.compile(r'<div class=\"text-sm leading-6 text-black-40 mb-0.5 flex flex-row md:pb-0.75\"><span class=\"flex whitespace-nowrap\">撰文:<\/span><span class=\"flex\">([\s\S]*?)<\/span><\/div>')
# Article body: captures everything inside the article container element.
_ARTICLE_RE = re.compile(r'<article id=\"article-content-section\" class="cmp-article-detail">([\s\S]*?)<\/article>')
# Any "<...>" span, i.e. what the script treats as an HTML tag.
_TAG_RE = re.compile(r'<([\s\S]*?)>')


def fetch_html(url: str = ARTICLE_URL, timeout: float = REQUEST_TIMEOUT) -> str:
    """Send a GET request for *url* and return the response body as text.

    The module-level ``header`` dict is sent as the request headers.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Imported here so the parsing helpers below remain importable (and
    # testable) on machines where the third-party package is not installed.
    import requests

    response = requests.get(url, headers=header, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    return response.text


def strip_tags(fragment: str) -> str:
    """Return *fragment* with every ``<...>`` span removed."""
    # A single substitution pass; replacing each found tag one by one rescans
    # the text per tag and misbehaves when one tag string contains another.
    return _TAG_RE.sub("", fragment)


def parse_article(html: str) -> tuple:
    """Extract ``(title, author, text)`` from an article page.

    Returns:
        title: list of ``(h1_attributes, title_text)`` tuples, one per match.
        author: list of matched byline strings.
        text: body of the first article container, with tags removed.

    Raises:
        IndexError: if the page contains no article container.
    """
    title = _TITLE_RE.findall(html)
    author = _AUTHOR_RE.findall(html)

    bodies = _ARTICLE_RE.findall(html)
    if not bodies:
        # Same exception type a bare ``[0]`` lookup would raise, but with a
        # message that says what was actually missing.
        raise IndexError("article content section not found in HTML")
    text = strip_tags(bodies[0])

    return title, author, text


if __name__ == "__main__":
    # Issue the GET request once and keep the HTML text.
    html = fetch_html()
    # Match the title, the author and the tag-free body text.
    title, author, text = parse_article(html)