import csv
import time

import jieba
import lxml
import matplotlib.pyplot as plt
import numpy as np
import paddle
import requests
import wordcloud
from bs4 import BeautifulSoup
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
# Collect (title, url) pairs from every Baidu search-result page in `urls`.
# NOTE(review): `urls` must be defined earlier in the file (not visible here).
results = []
for url in urls:
    try:
        res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'})
        soup = BeautifulSoup(res.text, 'lxml')
        # Each result entry is an <h3> wrapping an <a>; the aria-label text
        # starts with a 3-character prefix ("标题:") that is stripped off.
        for i in soup.find_all("h3"):
            result = {
                "title": i.a["aria-label"][3:],
                "url": i.a["href"],
            }
            results.append(result)
            print(result)
    except Exception:
        # Best-effort scraping: log the failing page URL and move on.
        print(url)
def get_website_info(url):
    """Fetch the article body text of a supported news URL.

    Only a handful of high-frequency sites are supported; any other domain
    is filtered out and yields an empty string (see tutorial text below).

    Args:
        url: Full article URL; the domain is taken from the third
            '/'-separated component.

    Returns:
        The concatenated paragraph text as one string, or "" when the site
        is unsupported or fetching/parsing fails before any text is found.
    """
    result = []
    website = url.split('/')[2]
    target_web = ['baijiahao.baidu.com', 'new.qq.com', 'mp.weixin.qq.com',
                  'news.sohu.com', 'www.sohu.com']
    if website not in target_web:
        # Unsupported site: automatically filtered, empty content returned.
        return ""
    try:
        res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70'})
        soup = BeautifulSoup(res.text, 'lxml')
        # Per-site selectors (documented in the tutorial text below).
        if website == "baijiahao.baidu.com":
            p = soup.find_all("p")
        elif website == "new.qq.com":
            p = soup.find_all(class_="one-p")
        elif website == "mp.weixin.qq.com":
            p = soup.find("div", {"id": "js_content"})
        else:
            # news.sohu.com / www.sohu.com share the same page structure.
            # (Original had `== "news.sohu.com" or "www.sohu.com"`, which is
            # always truthy -- a classic precedence bug.)
            p = soup.find_all("article", class_="article")
        for i in p:
            if i.text != "":
                result.append(i.text)
        return "".join(result)
    except Exception:
        # Partial failure: return whatever text was gathered so far as a
        # string (the original returned the raw list, breaking the caller's
        # `c != ""` check).
        return "".join(result)
由于抓取的内容是文字信息,且所在的网站无明显反爬策略,所以只需要找出文章所在的位置即可,方法跟第三小节中的一样,只需要找出相关的 html 标签即可。
百家号的文章存在于p标签内;腾讯新闻的文章存在于 class 为
one-p
的
p
标签内;微信公众号的文章存在于
id
为
js_content
的
div
标签内;搜狐有两个相关的网站,但是网页结构都是一样的,文章存在于 class 为
article
的
article
标签内。
此处只指定了数量排名靠前的网站的抓取策略,其余的网站会自动过滤并返回空信息。
6.将所有文章写入 csv 文件中
# Write every fetched article to webinfo.csv as (title, url, content) rows.
countt = 0
with open("webinfo.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "url", "content"])
    for i in results:
        c = get_website_info(i["url"])
        time.sleep(2)  # throttle requests to be polite to the target sites
        if c != "":
            writer.writerow([i["title"], i["url"], c])
            countt += 1
            # Progress indicator every 10 articles.
            if countt % 10 == 0:
                print("已完成{}条".format(countt))
# `with` closes the file automatically; the original's extra f.close()
# inside the block was redundant.
# Second variant: write ONLY the article content -- no header, no title/url.
# The commented-out lines are kept deliberately; the tutorial text below
# explains what `#` comments are.
countt = 0
with open("webinfo.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    # writer.writerow(["title","url","content"])
    for i in results:
        c = get_website_info(i["url"])
        time.sleep(2)  # throttle requests
        if c != "":
            # content = c
            # w = {
            #     "title":i["title"],
            #     "url":i["url"],
            #     "content":c
            # }
            # writer.writerow([w["title"],w["url"],w["content"]])
            # Must wrap the string in a list: csv.writerow iterates its
            # argument, so a bare string would emit one cell per CHARACTER.
            writer.writerow([c])
            countt += 1
            if countt % 10 == 0:
                print("已完成{}条".format(countt))
其中
#
代表注释,即运行时不执行此行代码。
写入数据时,
open
方法里的
w
参数指的是写入,纯写入,即打开文件后,不管文件里之前有没有数据,都会从 0 开始写入数据。这里因为只写入一次,所以用的
w
模式,如果需要多次写入,可以用
a
,即“追加写入”模式。
# Read webinfo.csv back and collect the content column (index 2) of every
# data row, skipping the header.
p_list = []
with open("webinfo.csv", "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    for row in reader:
        # Guard against empty rows; each data row is (title, url, content).
        if row and row[0] != "title":
            p_list.append(row[2])
# File is closed by the `with` block; no explicit close needed.
# Segment every article into space-separated tokens with jieba.
paddle.enable_static()
jieba.enable_paddle()
# Precise mode (cut_all=False) keeps nouns intact rather than emitting
# every possible sub-word.
p_list_2 = [" ".join(jieba.cut(article, cut_all=False)) for article in p_list]
# Load the stop-word list and drop stop words from the segmented text.
with open("baidu_stopwords.txt", "r", encoding="utf-8-sig") as stopwords:
    # Use a set for O(1) membership tests instead of a list.
    stop_words = {line.strip() for line in stopwords}
filtered = []
for segmented in p_list_2:
    # Iterate over WORDS -- the text was joined with spaces above.  The
    # original iterated characters (`for j in i`), so multi-character stop
    # words were never matched and removed.
    filtered.append(" ".join(w for w in segmented.split() if w not in stop_words))
data_1 = " ".join(filtered)
首先需要读取上一小节的 csv 文件,在
open
方法中,之前写入时用的是
w
,这里需要用到
r
,即 read。由于上一节在写入文件时候,是结构化写入,所以在读取的时候,也需要层层读取。第一步用
csv.reader
生成一个可读取的对象,第二步开始读取上述 csv 文件,先省略第一行,从第二行开始读取,因为每一行的结构都是标题、网址、文章内容,所以读的时候只需要读每一行的第 3 个元素就行了。然后把所有文章添加到一个列表中。