`
ssydxa219
  • 浏览: 627021 次
  • 性别: Icon_minigender_1
  • 来自: 杭州
文章分类
社区版块
存档分类
最新评论

pysssssssssss

 
阅读更多
# -*- coding:utf-8 -*-
from urllib import request as urllib2
from urllib import parse
from bs4 import BeautifulSoup as bs
import json
import codecs

# Page that is fetched and parsed by the script below.
url = 'https://movie.douban.com/subject/26363254/'

# Path of the JSON file the script writes its result to.
outputjsonfile = "/ddhome/data/movie.json"

# HTTP headers sent with the request; only a Firefox User-Agent is set.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) '
        'Gecko/20100101 Firefox/57.0'
    ),
}


def DelLastChar(str):
    """Return ``str`` with its final character removed.

    An empty string is returned unchanged; the previous list-based
    implementation raised ``IndexError`` in that case.

    Args:
        str: The text to trim. The name shadows the builtin ``str`` and is
            kept only so existing keyword callers keep working.

    Returns:
        A new string containing every character of ``str`` except the last.
    """
    # Slicing is safe on empty input, unlike list.pop().
    return str[:-1]

# Identifier stored verbatim in the 'MovieID' field of the output.
MovieID = "26363254"

# Fetch the page with a plain GET. No form payload is sent: supplying `data`
# would make urllib issue a POST, and no form data is defined in this script.
request = urllib2.Request(url=url, headers=headers)
# The context manager closes the HTTP response once the body has been read.
with urllib2.urlopen(request) as response:
    html_data = response.read().decode('utf-8')
soup = bs(html_data, 'html.parser')

# Each field is taken from the first element matching the given selector.
# A missing element surfaces as AttributeError / IndexError.
movie_name = soup.find("span", {"property": "v:itemreviewed"}).text
year = soup.select('.year')[0].text
score = soup.find("strong", {"property": "v:average"}).text
score_num = soup.find("span", {"property": "v:votes"}).text
movie_type = soup.find("span", {"property": "v:genre"}).text

# Director: look the link up once, then derive the id from its href by
# stripping the "/celebrity/" prefix and any remaining slashes.
director_link = soup.find("a", {"rel": "v:directedBy"})
directerid = director_link.get("href").replace("/celebrity/", "").replace("/", "")
directername = director_link.text

# Actors: one dict per link, using the same keys as the 'Director' entries so
# the JSON encoder handles quoting and escaping of the names.
actor_list = [
    {
        'StarID': actor.get("href").replace("/celebrity/", "").replace("/", ""),
        'starName': actor.text,
    }
    for actor in soup.select('.actor .attrs > a')
]

json_data = {
    'MovieID': MovieID,
    'MovieName': movie_name,
    'MovieYear': year,
    'MovieScore': score,
    'VoteNumber': score_num,
    'MovieType': movie_type,
    'Director': [{'StarID': directerid, 'starName': directername}],
    'Scenarist': [],
    'Actor': actor_list,
}

# ensure_ascii=False keeps non-ASCII text readable in the UTF-8 output file.
with codecs.open(outputjsonfile, "w", "utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4)
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics