# 导入爬虫库
import requests
# 导入pyquery(数据提取)
from pyquery import PyQuery as pq
# 用于延时请求
import time
# HTTP request headers sent with every request.
# NOTE(review): the Cookie value is a hardcoded session snapshot captured from a
# browser and will expire; refresh it if requests start failing or redirecting.
headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.9 Safari/537.36',
'Cookie':'aliyungf_tc=AQAAACrtMHyGHA4ARxkbZ27Kgw3kCofh; route=ac205598b1fccbab08a64956374e0f11; JSESSIONID=5B42F8C6E712092B9A963E3F0532AD21; uuid=9065c880-0293-4758-86a8-0a228c6cfb2c; SERVERID=srv-omp-ali-portal10_80; Hm_lvt_94a1e06bbce219d29285cee2e37d1d26=1587280903; Hm_lpvt_94a1e06bbce219d29285cee2e37d1d26=1587280903; UM_distinctid=17191507d62338-03d1defec13f5f-721f3a40-144000-17191507d63400; CNZZDATA1261102524=262517629-1587279306-null%7C1587279306; __ads_session=6NY9VLMBdgmIzmsFHgA=',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Host':'www.thepaper.cn'}
def get_news(href, title):
    """Fetch a single news article page and print its body text.

    Parameters
    ----------
    href : str
        Absolute URL of the article page.
    title : str
        Article title (currently unused; kept for caller compatibility).
    """
    # Request the article page; timeout keeps the scraper from hanging
    # indefinitely on a stalled connection.
    html = requests.get(href, headers=headers, timeout=10).text
    # Parse the HTML so it can be queried with CSS selectors.
    doc = pq(html)
    # The article body is in elements with this class name.
    # NOTE(review): the hashed suffix looks build-generated and may change
    # when the site redeploys — verify the selector still matches.
    paragraphs = doc(".index_cententWrap__Jv8jK").items()
    for paragraph in paragraphs:
        # Print the plain text of each matched content block.
        print(paragraph.text())
def get_news_title():
    """Scrape the channel index page of thepaper.cn.

    Prints each article's absolute URL and title, then delegates to
    get_news() to fetch and print the article body.
    """
    # Channel index page listing the articles to scrape.
    url = 'https://www.thepaper.cn/channel_25951'
    # Request the listing page; timeout keeps the scraper from hanging
    # indefinitely on a stalled connection. (Fixed typo: was `respoonse`.)
    response = requests.get(url, headers=headers, timeout=10).text
    # Small delay to avoid hammering the server before follow-up requests.
    time.sleep(1)
    # Parse the HTML so it can be queried with CSS selectors.
    doc = pq(response)
    # Each anchor under this class holds one article's relative URL and title.
    # .items() yields the matches as individual PyQuery objects.
    anchors = doc(".small_toplink__GmZhY a").items()
    print(anchors)
    for anchor in anchors:
        # Build the absolute article URL from the anchor's relative href.
        href = "https://www.thepaper.cn/" + anchor.attr("href")
        print(href)
        # The anchor's text content is the article title.
        title = anchor.text()
        print(title)
        get_news(href, title)
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    get_news_title()