Scootchen_CSDN · 2022-10-27 20:34 · acceptance rate: 50%
20 views

Unused import statement "from lxml import etree"?

Everything worked fine before, but suddenly I am getting an "unused import statement: from lxml import etree" warning. Why is that? Could someone take a look and tell me where the mistake is?

(Screenshot attached: /Users/chenyuhui/Desktop/截屏2022-10-27 20.33.17.png)
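For context, a minimal sketch that triggers the same message (my assumption is that it comes from the editor's unused-import inspection, e.g. PyCharm, rather than from running the script itself):

import re
from lxml import etree   # flagged: the name etree is never referenced below

print(re.findall(r'\d+', 'abc123'))   # only re is used, so only the etree import is reported as unused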

import requests
import re
from lxml import etree
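# note: etree is imported above but never referenced anywhere else in this script,
# which is the condition the "unused import" inspection reports on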
import csv
import time
import asyncio
import aiohttp
import logging
import pandas as pd
import numpy as np
import random
import ssl
import certifi
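# TLS context backed by certifi's CA bundle; it is passed to every aiohttp request below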
ssl_context = ssl.create_default_context()
ssl_context.load_verify_locations(certifi.where())
URL1='https://movie.douban.com/subject/35131346/comments?status=P'
file_name='短评-{index}.csv'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
    'cookie': 'll="108298"; bid=bQ20QaMu-Hc; push_noty_num=0; push_doumail_num=0; __utmv=30149280.12857; ct=y; gr_user_id=15663cd4-0204-441c-af43-e8b1a7d2d5f2; __gads=ID=4ed9362f7037ff69-22a328ce7fd70077:T=1666667493:RT=1666667493:S=ALNI_Mb0n4LajrFJVVuT5rLrXwGQO4aPfw; __utmz=30149280.1666760296.17.6.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/128572766/; ap_v=0,6.0; __gpi=UID=00000b6c11ffabc2:T=1666667493:RT=1666831710:S=ALNI_MbUWI8jPIZJO0A34AwiFDfTsDqfyQ; __utma=30149280.1432797394.1664965576.1666830229.1666834156.23; __utmc=30149280; __utmt=1; dbcl2="128572766:obLNF0XC3Tw"; ck=61Xh; __utmb=30149280.17.10.1666834156'
}
# basic logging configuration
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
start=time.time()
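# URL2 pages through the short-comment list via the start= offset; URL3 is a commenter's profile page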
URL2=URL1[0:51]+'start={start}&limit=20&status=P&sort=new_score'
URL3='https://www.douban.com/people/{authorid}/'
CONCURRENCY = 5
semaphore = asyncio.Semaphore(CONCURRENCY)
with open(file_name.format(index=re.findall('subject/(.*?)/comments', URL1)[0]), 'w', encoding='utf-8', newline='') as csvfile:  # newline='' is recommended when writing with the csv module
    writer = csv.writer(csvfile)
    writer.writerow(['序号','评论者', '评论', 'IP', '常驻地','有用量','网址'])
df=pd.DataFrame(pd.read_excel('./代理IP.xlsx'))
IPPORTS=list(df.loc[:,"IPPORTS"])
proxys = np.unique(IPPORTS)  # de-duplicated list of proxy host:port strings
async def scrape_api(url):
    async with semaphore:
        try:
            logging.info('scraping %s', url)
            proxy = random.choice(proxys)
            # aiohttp does not use requests-style proxy dicts; the proxy is passed per request,
            # and only HTTP proxies are supported via the proxy= argument
            async with session.get(url, headers=headers, ssl=ssl_context,
                                   proxy='http://' + proxy) as response:
                return await response.text()
        except aiohttp.ClientError:
            logging.error('error occurred while scraping %s', url, exc_info=True)
async def scrape_index(page):
    url = URL2.format(start=page)
    return await scrape_api(url)
async def scrape_detail(authorid):
    url = URL3.format(authorid=authorid)
    return await scrape_api(url)
async def main():
    global session
    session = aiohttp.ClientSession()
    r1=requests.get(URL1,headers=headers)
    web1=r1.text
    # print(r1.status_code)
    print(web1)
    review_counts = int(re.findall(r'看过\((.*?)\)', web1)[0])
    print("共{}条短评".format(review_counts))
    # start= is a comment offset in the URL, so it advances by 20 (the page size) up to the total count
    scrape_index_tasks = [asyncio.ensure_future(scrape_index(page)) for page in range(0, review_counts, 20)]
    results = await asyncio.gather(*scrape_index_tasks)
    for page,web2 in enumerate(results):
        # NOTE: the HTML tags inside these patterns were lost when the question was posted;
        # the tag/class names below are a best-guess reconstruction of Douban's short-comment markup
        reviews = re.findall('<span class="short">(.*?)</span>', web2)
        youyongs = re.findall('<span class="votes vote-count">(.*?)</span>', web2)
        authorids = re.findall('https://www.douban.com/people/(.*?)/" class="">', web2)
        authors = re.findall('class="">(.*?)</a>', web2)
        for i, (review, youyong, author, authorid) in enumerate(zip(reviews, youyongs, authors, authorids)):
            with open(file_name.format(index=re.findall('subject/(.*?)/comments', URL1)[0]), 'a', encoding='utf-8', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([i+1+page*20, author, review, "/", "/", youyong, URL3.format(authorid=authorid)])
    # close the shared aiohttp session once all rows have been written
    await session.close()
if __name__ == '__main__':
    loop=asyncio.get_event_loop()
    loop.run_until_complete(main())
    end=time.time()
    print("用时{}s".format(end-start))
#     print(reviews_sum)
#     print(authorhrefs)
#     print(levels)
#     print(len(reviews_sum))
#     print(len(authorhrefs))
#     print(len(levels))
# for authorhref in authorhrefs:
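If the plan is to do the parsing with lxml (which would also make the warning disappear, because etree would then actually be used), a rough sketch could look like the following; the XPath expressions are only assumptions about Douban's comment-page markup and are not verified here:

from lxml import etree

def parse_comments(html_text):
    # sketch only: the tag/class names are assumptions about Douban's short-comment markup
    tree = etree.HTML(html_text)
    reviews = tree.xpath('//span[@class="short"]/text()')
    votes = tree.xpath('//span[contains(@class, "vote-count")]/text()')
    authors = tree.xpath('//span[@class="comment-info"]/a/text()')
    return list(zip(authors, votes, reviews))

Otherwise, simply deleting the "from lxml import etree" line clears the message, since the warning only says that the imported name is never used.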
       